# qa_core/indexing/chunking.py
from langchain_text_splitters import (
MarkdownHeaderTextSplitter,
RecursiveCharacterTextSplitter,
)
from qa_core.config.settings import get_settings
from qa_core.utils import stable_hash
# Chinese-first separator priority for recursive splitting:
# paragraph -> line break -> sentence punctuation -> phrase punctuation
# -> space -> single character.
#
# Full-width punctuation is written as \uXXXX escapes on purpose: a
# width-folding normalization (e.g. NFKC) of this file would otherwise turn
# each full-width mark into its ASCII twin, leaving duplicated ASCII entries
# and no full-width separators at all.
CHINESE_SEPARATORS = [
    "\n\n",
    "\n",
    # Sentence-level punctuation (full-width first, then ASCII).
    "。",
    "\uff01",  # fullwidth exclamation mark
    "\uff1f",  # fullwidth question mark
    "\uff1b",  # fullwidth semicolon
    ";",
    ".",
    "!",
    "?",
    # Phrase-level punctuation.
    "\uff0c",  # fullwidth comma
    ",",
    " ",
    # Last resort: split between individual characters.
    "",
]
# NOTE(review): `Document` is not imported in the visible part of this module;
# confirm it is brought into scope (presumably the LangChain `Document` class).
def split_documents(documents: list[Document]) -> tuple[list[Document], list[str]]:
    """Split documents into a two-level Parent-Child chunk structure.

    Child chunks are the small units intended for vector indexing and precise
    retrieval (Milvus, per the original author's note). Every child carries
    its parent's full text in ``metadata["parent_content"]`` so the larger
    context can be handed to the LLM.

    Routing per input document:

    * ``content_type`` starting with ``"table"``: the document is treated as
      one indivisible row. It becomes a single parent and a single child.
    * ``file_type == ".md"``: split on ``#``/``##``/``###`` headers first,
      then each section is split into parents.
    * anything else: split straight into parents.

    Args:
        documents: Source documents. Their metadata is read for
            ``content_type``, ``file_type`` and the fields that feed the
            parent ID hash.

    Returns:
        ``(chunks, ids)``: the child documents and their ``chunk_id`` values,
        as two parallel lists in the same order.
    """
    settings = get_settings()

    # Two splitters: large Parent chunks and small Child chunks. The original
    # notes give the configured values as 1000/100 and 350/50.
    parent_splitter = RecursiveCharacterTextSplitter(
        chunk_size=settings.parent_chunk_size,
        chunk_overlap=settings.parent_overlap,
        separators=CHINESE_SEPARATORS,
    )
    child_splitter = RecursiveCharacterTextSplitter(
        chunk_size=settings.child_chunk_size,
        chunk_overlap=settings.child_overlap,
        separators=CHINESE_SEPARATORS,
    )
    # Built once: the header configuration does not depend on the document.
    header_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[("#", "h1"), ("##", "h2"), ("###", "h3")]
    )

    chunks: list[Document] = []
    ids: list[str] = []

    for doc in documents:
        content_type = str(doc.metadata.get("content_type", "")).lower()
        file_type = str(doc.metadata.get("file_type", "")).lower()
        is_table = content_type.startswith("table")

        if is_table:
            # Branch 1: table rows are never split. Per the original note,
            # the table loader already renders a row as one semantic unit
            # (header + row number + cell key/values); cutting it would
            # separate related fields into different chunks.
            parent_docs = [doc]
        elif file_type == ".md":
            # Branch 2: Markdown is split by headers first; the header text
            # is kept under the h1/h2/h3 metadata keys.
            header_docs = header_splitter.split_text(doc.page_content)
            for header_doc in header_docs:
                # Restore the file-level metadata on every section.
                header_doc.metadata.update(doc.metadata)
            parent_docs = parent_splitter.split_documents(header_docs)
        else:
            # Branch 3: plain text goes straight to the parent splitter.
            parent_docs = parent_splitter.split_documents([doc])

        for parent_doc in parent_docs:
            parent_content = parent_doc.page_content
            # Parent ID is a hash over the identifying metadata plus the
            # parent text, so it stays the same while those inputs are
            # unchanged. stable_hash is assumed deterministic; the original
            # note describes it as SHA-256 over these fields.
            parent_id = stable_hash(
                parent_doc.metadata.get("scenario_id"),
                parent_doc.metadata.get("kb_version"),
                parent_doc.metadata.get("embedding_model_version"),
                parent_doc.metadata.get("chunk_schema_version"),
                parent_doc.metadata.get("doc_id"),
                parent_content,
            )

            if is_table:
                # Keep the row whole at the Child level too. Previously a row
                # longer than the child chunk size was still cut here, which
                # contradicted the "do not split table rows" rule above.
                # The text is stripped so that rows which already fit in one
                # child keep their chunk_id (this relies on the splitter's
                # default whitespace stripping); whitespace-only rows still
                # produce no chunk.
                row_text = parent_content.strip()
                child_docs = (
                    [
                        Document(
                            page_content=row_text,
                            metadata=dict(parent_doc.metadata or {}),
                        )
                    ]
                    if row_text
                    else []
                )
            else:
                child_docs = child_splitter.split_documents([parent_doc])

            for child_doc in child_docs:
                # NOTE(review): two children with identical text under the
                # same parent_id get the same chunk_id; confirm the vector
                # store tolerates duplicate IDs.
                chunk_id = stable_hash(parent_id, child_doc.page_content)
                metadata = dict(child_doc.metadata or {})
                metadata.update(
                    {
                        "parent_id": parent_id,
                        "parent_content": parent_content,
                        "chunk_id": chunk_id,
                    }
                )
                chunks.append(
                    Document(page_content=child_doc.page_content, metadata=metadata)
                )
                ids.append(chunk_id)

    return chunks, ids