Python for AI Development

学 Python 的知识记录，重点关注 AI 开发中常用的部分。

# strip() = trim()
"###hello###".strip("#")  # "hello"
valid_messages = [m for m in messages if m["content"].strip()] # if content is truthy

Unpacking

列表解构

first, *rest = [1, 2, 3, 4, 5]
print(first)  # 1
print(rest)   # [2, 3, 4, 5]  — JS 的 rest 参数 ...rest

元组解构（函数返回多个值）

def get_model_info():
    """Return a ``(model_name, parameter_count)`` tuple for the model."""
    model_name = "qwen2.5:7b"
    parameter_count = 7_000_000_000
    return model_name, parameter_count

name, params = get_model_info()
print(f"模型: {name}, 参数量: {params:,}")

字典解构

for key, value in user.items():
    print(f"{key}: {value}")

List (JS: Array)

基本操作

fruits = ["apple", "banana", "cherry"]
fruits.append("date")         # JS: push()
fruits.insert(1, "avocado")   # JS: splice(1, 0, "avocado")
removed = fruits.pop()        # JS: pop()
fruits.remove("banana")       # JS: splice(indexOf("banana"), 1)

切片 (Slicing) — Python 独有的强大特性，JS 没有直接等价物

# Slice syntax is sequence[start:stop:step]; the stop index is exclusive.
numbers = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
print(numbers[2:5])    # [2, 3, 4]   — from index 2 up to, but not including, 5
print(numbers[:3])     # [0, 1, 2]   — the first 3 elements
print(numbers[-3:])    # [7, 8, 9]   — the last 3 elements
print(numbers[::2])    # [0, 2, 4, 6, 8] — every 2nd element (step of 2)

列表推导式 (List Comprehension)

# 完整语法
[表达式 for 变量 in 可迭代对象 if 条件]

# 对应 JS
可迭代对象.filter(变量 => 条件).map(变量 => 表达式)

# 1. 最简单：只有 map
[x * 2 for x in [1, 2, 3]]           # [2, 4, 6]
# JS: [1,2,3].map(x => x * 2)

# 2. 带条件：filter + map
[x * 2 for x in [1, 2, 3, 4, 5] if x > 2]   # [6, 8, 10]
# JS: [1,2,3,4,5].filter(x => x > 2).map(x => x * 2)

# 3. 表达式可以是任何东西
[len(word) for word in ["hello", "hi", "hey"]]        # [5, 2, 3]
[word.upper() for word in ["hello", "world"]]          # ['HELLO', 'WORLD']
[{"role": "user", "content": msg} for msg in ["你好", "帮我写代码"]]

# 4. 嵌套循环（展平二维数组）
# 阅读顺序：从左到右，和写 for 循环的顺序一样
[item for sublist in [[1,2],[3,4]] for item in sublist]   # [1, 2, 3, 4]
# 等价于：
# for sublist in [[1,2],[3,4]]:
#     for item in sublist:
#         result.append(item)

核心理解：把它看作一个倒过来写的 for 循环 —— 先写你要什么（表达式），再写从哪来（for），最后写筛选条件（if）。

AI 场景：文档切分（简化版）

document = "这是一段很长的文档内容，需要切分成小块来做 RAG 检索。" * 10
chunk_size = 50
# Split document into chunks of chunk_size characters
chunks = [document[i:i+chunk_size] for i in range(0, len(document), chunk_size)]
print(f"文档长度: {len(document)}, 切分成 {len(chunks)} 块")

字符串切分的其他写法

text = "abcdefghijklmnop"
size = 5

# 1. List comprehension (most common, recommended)
# [expression for variable in iterable if condition]
# The slice width and the range step must both be `size`, so each chunk
# starts exactly where the previous one ended.
chunks = [text[i:i+size] for i in range(0, len(text), size)]

# 2. textwrap.wrap — 标准库，专门做固定宽度切分
# ⚠️ 注意：wrap 会在空格处断行，不一定严格按 size 切，适合英文文本
import textwrap
chunks = textwrap.wrap(text, width=size)

# 3. re.findall — regex; concise but harder to read
# re.DOTALL makes "." match newlines as well; without it, newline characters
# are silently dropped and the chunks no longer add up to the original text.
import re
chunks = re.findall(f'.{{1,{size}}}', text, flags=re.DOTALL)

# 4. itertools + 生成器 — 适合处理超大数据（惰性求值，不一次性加载到内存）
from itertools import islice
def chunk_iter(s, size):
    """Lazily yield consecutive pieces of ``s``, each at most ``size`` items long.

    The pieces are built with ``''.join``, so the items of ``s`` must be strings.
    """
    source = iter(s)
    while True:
        piece = ''.join(islice(source, size))
        if not piece:
            # An empty join means the iterator is exhausted.
            return
        yield piece
chunks = list(chunk_iter(text, size))

实际 AI 开发中：文档切分一般用 LangChain 的 RecursiveCharacterTextSplitter，它会在语义边界（段落、句子）处切分，而不是硬切字符数。日常写代码用方法 1（列表推导式）就够了。

Dict (JS: Object / Map)

基本操作

user = {"name": "Alice", "age": 30, "role": "engineer"}
print(user["name"])           # JS: user.name 或 user["name"]
print(user.get("email", "N/A"))  # 安全取值，不存在返回默认值（JS 没有直接等价物）

AI 场景：解析 LLM API 返回值（这个结构你会天天见到）

api_response = {
    "choices": [
        {
            "message": {
                "role": "assistant",
                "content": "React hooks 是一种让函数组件拥有状态的机制"
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {"prompt_tokens": 10, "completion_tokens": 25, "total_tokens": 35}
}

# Extract the reply content from API response
content = api_response["choices"][0]["message"]["content"]
tokens_used = api_response["usage"]["total_tokens"]
print(f"回复: {content}")
print(f"消耗 tokens: {tokens_used}")

字典推导式 (Dict Comprehension)

# 完整语法
{key表达式: value表达式 for 变量 in 可迭代对象 if 条件}

# 1. 基本用法
{word: len(word) for word in ["hello", "world"]}
# {'hello': 5, 'world': 5}
# JS: Object.fromEntries(["hello","world"].map(w => [w, w.length]))

# 2. 带条件过滤
scores = {"Alice": 85, "Bob": 60, "Charlie": 92}
{key: value for key, value in scores.items() if value > 70}
# {'Alice': 85, 'Charlie': 92}

# 3. 值转换（类似 map）
{name: score / 100 for name, score in scores.items()}
# {'Alice': 0.85, 'Bob': 0.6, 'Charlie': 0.92}

# 4. 反转 key-value
original = {"a": 1, "b": 2, "c": 3}
{v: k for k, v in original.items()}
# {1: 'a', 2: 'b', 3: 'c'}

# 5. 从两个列表构建字典
keys = ["model", "temperature", "max_tokens"]
values = ["qwen2.5:7b", 0.7, 1000]
{k: v for k, v in zip(keys, values)}
# {'model': 'qwen2.5:7b', 'temperature': 0.7, 'max_tokens': 1000}
# 注意：这个场景直接用 dict(zip(keys, values)) 更简洁

核心理解：和列表推导式完全一样的思路，只是每次产出一个 key: value 对而不是单个值。

合并字典

defaults = {"temperature": 0.7, "max_tokens": 1000, "model": "gpt-4"}
overrides = {"temperature": 0.3, "model": "qwen2.5:7b"}
config = {**defaults,**overrides}  # JS: {...defaults, ...overrides}
print(config)

Function (JS: Function)

basic function

# list[dict] 兼容
from __future__ import annotations
def basic_function(str: str, *array) -> list[dict]:
    """Signature demo: one positional string plus a variadic ``*array``.

    Args:
        str: Annotated as ``str``. NOTE(review): this parameter name shadows
            the built-in ``str`` inside the function body.
        *array: Any extra positional arguments, collected into a tuple.

    Returns:
        Annotated as ``list[dict]``; this stub has no implementation yet.
    """
    ...  # placeholder body — a ``def`` header with no body is a SyntaxError

lambda （JS 匿名函数）

# 这两个完全等价
lambda x: x * 2          # Python lambda
# (x) => x * 2           // JS 箭头函数

Decorator

import functools


def decorator_function(func):
    """Wrap ``func`` so every call prints the function name and positional args.

    Args:
        func: The callable to decorate.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func`` and returns its result unchanged.
    """
    @functools.wraps(func)  # copy __name__, __doc__, etc. onto the wrapper
    def wrapper(*args, **kwargs):
        # example(1, 2, name="Alice", age=30)
        # args   = (1, 2)
        # kwargs = {"name": "Alice", "age": 30}
        result = func(*args, **kwargs)
        # Printed after the call has returned; only positional args are shown.
        print(f"Calling {func.__name__}({args})")
        return result
    return wrapper

@decorator_function
def basic_function(a: int, b: int) -> int:
    """Return the sum of ``a`` and ``b``."""
    total = a + b
    return total

zip （JS: generator）

titles = ["RAG 入门", "Agent 实战", "Prompt 技巧", "LLM 基础", "向量数据库"]
relevance = [0.85, 0.92, 0.78, 0.95, 0.65]
documents = zip(titles, relevance) # [("RAG 入门",0.85), ("Agent 实战", 0.92), ("Prompt 技巧",0.78),("LLM 基础", 0.95), ("向量数据库", 0.65)]
#   for title, score in documents:
#       print(title, score)       # ✅ 第一次遍历正常
#   for title, score in documents:
#       print(title, score)       # ❌ 第二次什么都不会输出，已经耗尽了
#   如果需要多次使用，先转成 list：documents = list(zip(titles, relevance))。

sorted

sorted(可迭代对象, key=排序依据, reverse=是否倒序)

Object

class & extends

class BaseLLMClient:
    """Common base for LLM clients: stores the model name and endpoint URL."""

    def __init__(self, model: str, base_url: str):
        self.model, self.base_url = model, base_url

    def chat(self, messages: list[dict]) -> str:
        """Placeholder that always raises; subclasses supply the real chat()."""
        raise NotImplementedError("Subclass must implement chat()")

    def _format_request(self, messages: list[dict]) -> dict:
        """Build the request payload from the model name and ``messages``."""
        payload = dict(model=self.model, messages=messages)
        return payload

class OllamaClient(BaseLLMClient):
    """Client preconfigured with the base URL http://localhost:11434/v1."""

    def __init__(self, model: str = "qwen2.5:7b"):
        super().__init__(model=model, base_url="http://localhost:11434/v1")

    def chat(self, messages: list[dict]) -> str:
        """Return a canned reply naming the model and the message count."""
        # The payload is built but not sent anywhere; the reply is simulated.
        payload = self._format_request(messages)
        return f"[Ollama/{self.model}] 模拟回复: 收到 {len(messages)} 条消息"

client = OllamaClient()
messages = [{"role": "user", "content": "你好"}]
print(client.chat(messages))

dataclass & Magic Methods / Dunder Methods

from dataclasses import dataclass, field

@dataclass
class LLMConfig:
    """Configuration record; every field has a default, so ``LLMConfig()`` works."""
    model: str = "qwen2.5:7b"
    temperature: float = 0.7
    max_tokens: int = 1000
    stream: bool = False
    # default_factory builds a fresh list per instance instead of sharing one.
    stop_sequences: list[str] = field(default_factory=list)

# 不需要写 __init__，自动生成！
config1 = LLMConfig()
config2 = LLMConfig(model="deepseek-chat", temperature=0.3)
print(config1)
print(config2)
print(f"config1 == config2: {config1 == config2}")  # 自动生成 __eq__

@dataclass
class SearchResults:
    """Search-result collection supporting len(), indexing, iteration and ``in``."""
    results: list[Document] = field(default_factory=list)

    def __len__(self) -> int:
        """Back ``len(results)`` (JS: ``results.length``)."""
        count = len(self.results)
        return count

    def __getitem__(self, index):
        """Back ``results[index]`` by delegating to the underlying list."""
        return self.results[index]

    def __iter__(self):
        """Back ``for doc in results`` (JS: ``Symbol.iterator``)."""
        return iter(self.results)

    def __contains__(self, source: str) -> bool:
        """Back ``'file.md' in results``: True if any item's ``.source`` matches."""
        for doc in self.results:
            if doc.source == source:
                return True
        return False

# 使用起来像内置类型一样自然
results = SearchResults(results=[
    Document("内容1", "doc1.md"),
    Document("内容2", "doc2.md"),
    Document("内容3", "doc3.md"),
])

print(f"结果数量: {len(results)}")
print(f"第一个: {results[0]}")
print(f"包含 doc2.md: {'doc2.md' in results}")

Unpacking

列表解构

元组解构（函数返回多个值）

字典解构

List (JS: Array)

基本操作

切片 (Slicing) — Python 独有的强大特性，JS 没有直接等价物

列表推导式 (List Comprehension)

AI 场景：文档切分（简化版）

字符串切分的其他写法

Dict (JS: Object / Map)

基本操作

AI 场景：解析 LLM API 返回值（这个结构你会天天见到）

字典推导式 (Dict Comprehension)

合并字典

Function (JS: Function)

basic function

lambda （JS 匿名函数）

Decorator

zip （JS: generator）

sorted

Object

class & extends

dataclass & Magic Methods / Dunder Methods

Unpacking

列表解构

元组解构（函数返回多个值）

字典解构

List (JS: Array)

基本操作

切片 (Slicing) — Python 独有的强大特性，JS 没有直接等价物

列表推导式 (List Comprehension)

AI 场景：文档切分（简化版）

字符串切分的其他写法

Dict (JS: Object / Map)

基本操作

AI 场景：解析 LLM API 返回值（这个结构你会天天见到）

字典推导式 (Dict Comprehension)

合并字典

Function (JS: Function)

basic function

lambda （JS 匿名函数）

Decorator

zip （JS: generator）

sorted

Object

class & extends

dataclass & Magic Methods / Dunder Methods