mirror of
https://github.com/deepset-ai/haystack.git
fix: update tests and release notes
This commit is contained in:
parent
3ffa3b70a6
commit
307f8340b2
@@ -942,20 +942,22 @@
"\n",
"os.environ[\"OPENROUTER_API_KEY\"] = \"\"\n",
"\n",
"def show(chunk): # simple streaming callback\n",
"\n",
"def show(chunk): # simple streaming callback\n",
" print(chunk.content, end=\"\", flush=True)\n",
"\n",
"\n",
"client = OpenRouterChatGenerator(\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
" # streaming_callback=show,\n",
" generation_kwargs={\n",
" \"provider\": {\"sort\": \"throughput\"}, # pick the fastest provider\n",
" }\n",
" \"provider\": {\"sort\": \"throughput\"} # pick the fastest provider\n",
" },\n",
")\n",
"\n",
"response = client.run([ChatMessage.from_user(\"In which year was the Turing machine invented\")])\n",
"\n",
"print (response)\n"
"print(response)"
]
},
{
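Editor's note on the cell above: the notebook prints the raw dictionary returned by run(). A minimal sketch of pulling out just the reply text, assuming the haystack-ai and OpenRouter integration packages this notebook installs elsewhere (the import path follows the integration's usual haystack_integrations namespace; adjust if your install differs), and assuming chat generators return {"replies": [ChatMessage, ...]} as Haystack chat generators generally do:

# Hypothetical follow-up cell; assumes OPENROUTER_API_KEY is already set.
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.openrouter import OpenRouterChatGenerator

client = OpenRouterChatGenerator(model="microsoft/mai-ds-r1:free")
response = client.run([ChatMessage.from_user("In which year was the Turing machine invented")])
# Print only the answer text instead of the whole result dictionary.
print(response["replies"][0].text)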
@@ -1004,7 +1006,7 @@
" response = requests.get(url)\n",
" with open(os.path.join(\"doc_zh\", filename), \"wb\") as f:\n",
" f.write(response.content)\n",
" print(f\"Downloaded: {filename}\")\n"
" print(f\"Downloaded: {filename}\")"
]
},
{
@@ -1042,7 +1044,7 @@
"\n",
"url = \"https://raw.githubusercontent.com/mc112611/PI-ka-pi/main/test_chinese_document_spliter.py\"\n",
"file_name = \"test_chinese_document_spliter.py\"\n",
"urllib.request.urlretrieve(url, file_name)\n"
"urllib.request.urlretrieve(url, file_name)"
]
},
{
@@ -2049,11 +2051,13 @@
"# Add MIME type for docx documents\n",
"# 文档路由器,通过 MIME 匹配文档类型,捕捉不同类型的文件\n",
"# File type router routes documents by MIME type to appropriate converters\n",
"file_type_router = FileTypeRouter(mime_types=[\n",
" \"text/plain\", \n",
" \"application/pdf\", \n",
" \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\"\n",
"])\n",
"file_type_router = FileTypeRouter(\n",
" mime_types=[\n",
" \"text/plain\",\n",
" \"application/pdf\",\n",
" \"application/vnd.openxmlformats-officedocument.wordprocessingml.document\",\n",
" ]\n",
")\n",
"\n",
"# 下面是多个 converter 对象,详情参考:https://docs.haystack.deepset.ai/docs/converters\n",
"# Below are converter components, see docs for more: https://docs.haystack.deepset.ai/docs/converters\n",
@@ -2081,11 +2085,7 @@
"# 文档分割器,按照 word 进行切分,长度为 150 个单词,窗口重叠为 20\n",
"# Document splitter: splits text by word, with a chunk size of 150 and overlap of 20\n",
"document_splitter = chinese_DocumentSpliter(\n",
" split_by=\"word\", \n",
" split_length=150, \n",
" split_overlap=20,\n",
" language='zh', \n",
" respect_sentence_boundary=False\n",
" split_by=\"word\", split_length=150, split_overlap=20, language=\"zh\", respect_sentence_boundary=False\n",
")\n",
"\n",
"# 使用 BAAI 的中文嵌入模型\n",
@@ -2113,7 +2113,9 @@
"# Set up pipeline connections between components\n",
"my_pipe.connect(\"file_type_router.text/plain\", \"text_file_converter.sources\")\n",
"my_pipe.connect(\"file_type_router.application/pdf\", \"pypdf_converter.sources\")\n",
"my_pipe.connect(\"file_type_router.application/vnd.openxmlformats-officedocument.wordprocessingml.document\", \"docx_converter.sources\")\n",
"my_pipe.connect(\n",
" \"file_type_router.application/vnd.openxmlformats-officedocument.wordprocessingml.document\", \"docx_converter.sources\"\n",
")\n",
"my_pipe.connect(\"text_file_converter\", \"document_joiner\")\n",
"my_pipe.connect(\"pypdf_converter\", \"document_joiner\")\n",
"my_pipe.connect(\"docx_converter\", \"document_joiner\")\n",
@@ -2127,10 +2129,9 @@
"# 执行管道,处理指定路径下的所有文件\n",
"# Run the pipeline on all files under the input directory\n",
"result = my_pipe.run(\n",
" {\"file_type_router\": {\"sources\": list(Path(input_dir).glob(\"**/*\"))}},\n",
" include_outputs_from={\"document_splitter\"}\n",
" {\"file_type_router\": {\"sources\": list(Path(input_dir).glob(\"**/*\"))}}, include_outputs_from={\"document_splitter\"}\n",
")\n",
"# print(result)\n"
"# print(result)"
]
},
{
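Editor's note on the cell above: passing include_outputs_from={"document_splitter"} keeps the splitter's intermediate output in the result in addition to the final writer output. A minimal sketch of inspecting it, assuming the pipeline and component names defined in this notebook:

# Hypothetical inspection cell; assumes `result` from the run above.
split_docs = result["document_splitter"]["documents"]  # exposed via include_outputs_from
print(len(split_docs), "chunks produced by the Chinese splitter")
print(split_docs[0].content[:100])  # preview the first 100 characters of the first chunk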
@@ -2181,12 +2182,13 @@
"\n",
"from haystack.components.embedders import SentenceTransformersTextEmbedder\n",
"from haystack_integrations.components.retrievers.chroma import ChromaEmbeddingRetriever\n",
"from haystack.components.builders import PromptBuilder,ChatPromptBuilder\n",
"\n",
"from haystack.components.builders import PromptBuilder, ChatPromptBuilder\n",
"\n",
"\n",
"os.environ[\"OPENROUTER_API_KEY\"] = \"\"\n",
"def show(chunk): # simple streaming callback\n",
"\n",
"\n",
"def show(chunk): # simple streaming callback\n",
" print(chunk.content, end=\"\", flush=True)\n",
"\n",
"\n",
@@ -2210,31 +2212,23 @@
"\n",
"# other questions:\n",
"# 《1984》:\n",
" # 1.“双重思想”是什么?\n",
" # 2.温斯顿是做什么工作的?\n",
" # 3.什么是“思想罪”?\n",
" # 4.老大哥永远正确吗?\n",
" # 5.什么是“101房间”?\n",
"# 1.“双重思想”是什么?\n",
"# 2.温斯顿是做什么工作的?\n",
"# 3.什么是“思想罪”?\n",
"# 4.老大哥永远正确吗?\n",
"# 5.什么是“101房间”?\n",
"# 《Ramanujan》:\n",
" # 1.拉马努金的启蒙数学书是什么?\n",
" # 2.拉马努金于哪一年去世的?\n",
" # 3. 拉马努金认为他的公式是谁给他的?\n",
"# 1.拉马努金的启蒙数学书是什么?\n",
"# 2.拉马努金于哪一年去世的?\n",
"# 3. 拉马努金认为他的公式是谁给他的?\n",
"# 《Oppenheimer》:\n",
" # 1.奥本海默是犹太人吗?\n",
" # 2.奥本海默哪一年发明原子弹?\n",
" # 3.奥本海默名言是什么?\n",
"question = (\n",
" \"奥本海默是犹太人吗?\"\n",
")\n",
"# 1.奥本海默是犹太人吗?\n",
"# 2.奥本海默哪一年发明原子弹?\n",
"# 3.奥本海默名言是什么?\n",
"question = \"奥本海默是犹太人吗?\"\n",
"\n",
"# 将问题输入进embedder组件的“text”参数,以及prompt的“question”参数,llm组件的大模型配置参数字典,修改生成的最大长度为350\n",
"result=RAG_pipe.run(\n",
" {\n",
" \"embedder\": {\"text\": question},\n",
" \"retriever\":{\"top_k\":3}\n",
"\n",
" }\n",
")\n",
"result = RAG_pipe.run({\"embedder\": {\"text\": question}, \"retriever\": {\"top_k\": 3}})\n",
"\n",
"pprint(result[\"retriever\"][\"documents\"])\n",
"# for i in result[\"retriever\"][\"documents\"]:\n",
@@ -2303,19 +2297,22 @@
"\n",
"os.environ[\"OPENROUTER_API_KEY\"] = \"\"\n",
"\n",
"def show(chunk): # simple streaming callback\n",
"\n",
"def show(chunk): # simple streaming callback\n",
" print(chunk.content, end=\"\", flush=True)\n",
"\n",
"\n",
"client = OpenRouterChatGenerator(\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
" # streaming_callback=show,\n",
" generation_kwargs={\n",
" \"provider\": {\"sort\": \"throughput\"}, # pick the fastest provider\n",
" }\n",
" \"provider\": {\"sort\": \"throughput\"} # pick the fastest provider\n",
" },\n",
")\n",
"\n",
"\n",
"template = [ChatMessage.from_user(\"\"\"\n",
"template = [\n",
" ChatMessage.from_user(\"\"\"\n",
"根据给出的文本来回答问题.\n",
"\n",
"文本:\n",
@@ -2325,20 +2322,19 @@
"\n",
"问题: {{ question }}\n",
"你的回答:\n",
"\"\"\")]\n",
"\"\"\")\n",
"]\n",
"\n",
"prompt_builder=ChatPromptBuilder(template=template)\n",
"llm= OpenRouterChatGenerator(\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
"prompt_builder = ChatPromptBuilder(template=template)\n",
"llm = OpenRouterChatGenerator(\n",
" model=\"microsoft/mai-ds-r1:free\", # let OpenRouter pick a model\n",
" # streaming_callback=show,\n",
" generation_kwargs={\n",
" \"provider\": {\"sort\": \"throughput\"}, # pick the fastest provider\n",
" }\n",
" \"provider\": {\"sort\": \"throughput\"} # pick the fastest provider\n",
" },\n",
")\n",
"\n",
"\n",
"\n",
"\n",
"# 创建管道节点\n",
"RAG_pipe = Pipeline()\n",
"RAG_pipe.add_component(\"text_embedder\", SentenceTransformersTextEmbedder(model=\"BAAI/bge-small-zh-v1.5\"))\n",
@@ -2357,10 +2353,7 @@
"\n",
"response = RAG_pipe.run({\"text_embedder\": {\"text\": question}, \"prompt_builder\": {\"question\": question}})\n",
"\n",
"print(response[\"llm\"][\"replies\"][0].text)\n",
"\n",
"\n",
"\n"
"print(response[\"llm\"][\"replies\"][0].text)"
]
},
{
@ -1,8 +1,7 @@
|
||||
'''
|
||||
"""
|
||||
haystack-ai == 2.12.1
|
||||
|
||||
'''
|
||||
|
||||
"""
|
||||
|
||||
from haystack.components.preprocessors import DocumentSplitter
|
||||
from copy import deepcopy
|
||||
@@ -18,10 +17,8 @@ from haystack.utils import deserialize_callable, serialize_callable
logger = logging.getLogger(__name__)

# mapping of split by character, 'function' and 'sentence' don't split by character
_CHARACTER_SPLIT_BY_MAPPING = {
"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
chinese_tokenizer_coarse = hanlp.load(
hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
_CHARACTER_SPLIT_BY_MAPPING = {"page": "\f", "passage": "\n\n", "period": ".", "word": " ", "line": "\n"}
chinese_tokenizer_coarse = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# 加载中文的句子切分器
split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)
@@ -29,7 +26,6 @@ split_sent = hanlp.load(hanlp.pretrained.eos.UD_CTB_EOS_MUL)

@component
class chinese_DocumentSpliter(DocumentSplitter):

def __init__(self, *args, particle_size: Literal["coarse", "fine"] = "coarse", **kwargs):
super(chinese_DocumentSpliter, self).__init__(*args, **kwargs)

@@ -44,12 +40,12 @@ class chinese_DocumentSpliter(DocumentSplitter):

def _split_by_character(self, doc) -> List[Document]:
split_at = _CHARACTER_SPLIT_BY_MAPPING[self.split_by]
if self.language == 'zh' and self.particle_size == "coarse":
if self.language == "zh" and self.particle_size == "coarse":
units = chinese_tokenizer_coarse(doc.content)

if self.language == 'zh' and self.particle_size == "fine":
if self.language == "zh" and self.particle_size == "fine":
units = chinese_tokenizer_fine(doc.content)
if self.language == 'en':
if self.language == "en":
units = doc.content.split(split_at)
# Add the delimiter back to all units except the last one
for i in range(len(units) - 1):
@@ -74,11 +70,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
for sentence in sentences:
start = text.find(sentence, start)
end = start + len(sentence)
results.append({
'sentence': sentence + '\n',
'start': start,
'end': end
})
results.append({"sentence": sentence + "\n", "start": start, "end": end})
start = end

return results
@@ -120,17 +112,17 @@ class chinese_DocumentSpliter(DocumentSplitter):
# chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
for sentence_idx, sentence in enumerate(sentences):
current_chunk.append(sentence)
if language == 'zh' and particle_size == "coarse":
if language == "zh" and particle_size == "coarse":
chunk_word_count += len(chinese_tokenizer_coarse(sentence))
next_sentence_word_count = (
len(chinese_tokenizer_coarse(
sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
len(chinese_tokenizer_coarse(sentences[sentence_idx + 1]))
if sentence_idx < len(sentences) - 1
else 0
)
if language == 'zh' and particle_size == "fine":
if language == "zh" and particle_size == "fine":
chunk_word_count += len(chinese_tokenizer_fine(sentence))
next_sentence_word_count = (
len(chinese_tokenizer_fine(
sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
len(chinese_tokenizer_fine(sentences[sentence_idx + 1])) if sentence_idx < len(sentences) - 1 else 0
)

# Number of words in the current chunk plus the next sentence is larger than the split_length,
@@ -143,24 +135,24 @@ class chinese_DocumentSpliter(DocumentSplitter):

# Get the number of sentences that overlap with the next chunk
num_sentences_to_keep = chinese_DocumentSpliter._number_of_sentences_to_keep(
sentences=current_chunk, split_length=split_length, split_overlap=split_overlap, language=language, particle_size=particle_size
sentences=current_chunk,
split_length=split_length,
split_overlap=split_overlap,
language=language,
particle_size=particle_size,
)
# Set up information for the new chunk
if num_sentences_to_keep > 0:
# Processed sentences are the ones that are not overlapping with the next chunk
processed_sentences = current_chunk[:-
num_sentences_to_keep]
chunk_starting_page_number += sum(sent.count("\f")
for sent in processed_sentences)
processed_sentences = current_chunk[:-num_sentences_to_keep]
chunk_starting_page_number += sum(sent.count("\f") for sent in processed_sentences)
chunk_start_idx += len("".join(processed_sentences))
# Next chunk starts with the sentences that were overlapping with the previous chunk
current_chunk = current_chunk[-num_sentences_to_keep:]
chunk_word_count = sum(len(s.split())
for s in current_chunk)
chunk_word_count = sum(len(s.split()) for s in current_chunk)
else:
# Here processed_sentences is the same as current_chunk since there is no overlap
chunk_starting_page_number += sum(sent.count("\f")
for sent in current_chunk)
chunk_starting_page_number += sum(sent.count("\f") for sent in current_chunk)
chunk_start_idx += len("".join(current_chunk))
current_chunk = []
chunk_word_count = 0
@@ -178,18 +170,21 @@ class chinese_DocumentSpliter(DocumentSplitter):
def _split_by_nltk_sentence(self, doc: Document) -> List[Document]:
split_docs = []

if self.language == 'zh':
if self.language == "zh":
result = self.chinese_sentence_split(doc.content)
if self.language == 'en':
result = self.sentence_splitter.split_sentences(
doc.content) # type: ignore # None check is done in run()
if self.language == "en":
result = self.sentence_splitter.split_sentences(doc.content) # type: ignore # None check is done in run()

units = [sentence["sentence"] for sentence in result]

if self.respect_sentence_boundary:
text_splits, splits_pages, splits_start_idxs = self._concatenate_sentences_based_on_word_amount(
sentences=units, split_length=self.split_length, split_overlap=self.split_overlap, language=self.language,
particle_size=self.particle_size)
sentences=units,
split_length=self.split_length,
split_overlap=self.split_overlap,
language=self.language,
particle_size=self.particle_size,
)
else:
text_splits, splits_pages, splits_start_idxs = self._concatenate_units(
elements=units,
@@ -221,8 +216,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
splits_start_idxs: List[int] = []
cur_start_idx = 0
cur_page = 1
segments = windowed(elements, n=split_length,
step=split_length - split_overlap)
segments = windowed(elements, n=split_length, step=split_length - split_overlap)

for seg in segments:
current_units = [unit for unit in seg if unit is not None]
@@ -245,8 +239,7 @@ class chinese_DocumentSpliter(DocumentSplitter):
if self.split_by == "page":
num_page_breaks = len(processed_units)
else:
num_page_breaks = sum(processed_unit.count("\f")
for processed_unit in processed_units)
num_page_breaks = sum(processed_unit.count("\f") for processed_unit in processed_units)

cur_page += num_page_breaks

@@ -279,11 +272,10 @@ class chinese_DocumentSpliter(DocumentSplitter):
doc_start_idx = splits_start_idxs[i]
previous_doc = documents[i - 1]
previous_doc_start_idx = splits_start_idxs[i - 1]
self._add_split_overlap_information(
doc, doc_start_idx, previous_doc, previous_doc_start_idx)
self._add_split_overlap_information(doc, doc_start_idx, previous_doc, previous_doc_start_idx)

for d in documents:
d.content=d.content.replace(" ","")
d.content = d.content.replace(" ", "")
return documents

@staticmethod
@@ -298,26 +290,24 @@ class chinese_DocumentSpliter(DocumentSplitter):
:param previous_doc: The Document that was split before the current Document.
:param previous_doc_start_idx: The starting index of the previous Document.
"""
overlapping_range = (current_doc_start_idx - previous_doc_start_idx,
len(previous_doc.content)) # type: ignore
overlapping_range = (current_doc_start_idx - previous_doc_start_idx, len(previous_doc.content)) # type: ignore

if overlapping_range[0] < overlapping_range[1]:
# type: ignore
overlapping_str = previous_doc.content[overlapping_range[0]: overlapping_range[1]]
overlapping_str = previous_doc.content[overlapping_range[0] : overlapping_range[1]]

if current_doc.content.startswith(overlapping_str): # type: ignore
# add split overlap information to this Document regarding the previous Document
current_doc.meta["_split_overlap"].append(
{"doc_id": previous_doc.id, "range": overlapping_range})
current_doc.meta["_split_overlap"].append({"doc_id": previous_doc.id, "range": overlapping_range})

# add split overlap information to previous Document regarding this Document
overlapping_range = (
0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append(
{"doc_id": current_doc.id, "range": overlapping_range})
overlapping_range = (0, overlapping_range[1] - overlapping_range[0])
previous_doc.meta["_split_overlap"].append({"doc_id": current_doc.id, "range": overlapping_range})

@staticmethod
def _number_of_sentences_to_keep(sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str) -> int:
def _number_of_sentences_to_keep(
sentences: List[str], split_length: int, split_overlap: int, language: str, particle_size: str
) -> int:
"""
Returns the number of sentences to keep in the next chunk based on the `split_overlap` and `split_length`.

@@ -336,10 +326,10 @@ class chinese_DocumentSpliter(DocumentSplitter):
# chinese_tokenizer_fine = hanlp.load(hanlp.pretrained.tok.FINE_ELECTRA_SMALL_ZH)
# Next overlapping Document should not start exactly the same as the previous one, so we skip the first sentence
for sent in reversed(sentences[1:]):
if language == 'zh' and particle_size == "coarse":
if language == "zh" and particle_size == "coarse":
num_words += len(chinese_tokenizer_coarse(sent))
# num_words += len(sent.split())
if language == 'zh' and particle_size == "fine":
if language == "zh" and particle_size == "fine":
num_words += len(chinese_tokenizer_fine(sent))
# If the number of words is larger than the split_length then don't add any more sentences
if num_words > split_length:
@@ -351,9 +341,10 @@


if __name__ == "__main__":

from pprint import pprint
doc = Document(content="""月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。

doc = Document(
content="""月光轻轻洒落,林中传来阵阵狼嚎,夜色悄然笼罩一切。
树叶在微风中沙沙作响,影子在地面上摇曳不定。
一只猫头鹰静静地眨了眨眼,从枝头注视着四周……
远处的小溪哗啦啦地流淌,仿佛在向石头倾诉着什么。
@@ -362,10 +353,13 @@
一只狐狸悄然出现,又迅速消失在灌木丛中。
天上的星星闪烁着,仿佛在诉说古老的故事。
时间仿佛停滞了……
万物静候,聆听着夜的呼吸!""")
万物静候,聆听着夜的呼吸!"""
)

# splitter = chinese_DocumentSpliter(split_by="sentence", split_length=3, split_overlap=1, language='zh')
splitter = chinese_DocumentSpliter(split_by="word", split_length=30, split_overlap=3,language='zh',respect_sentence_boundary=False)
splitter = chinese_DocumentSpliter(
split_by="word", split_length=30, split_overlap=3, language="zh", respect_sentence_boundary=False
)
splitter.warm_up()
result = splitter.run(documents=[doc])
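Editor's note on the __main__ block above: it runs the splitter on a sample Chinese document but does not show the output. A minimal sketch of inspecting the returned splits, assuming the chinese_DocumentSpliter defined in this file and the doc/result created in that block (the meta keys follow what DocumentSplitter and _add_split_overlap_information write; treat them as an assumption if your version differs):

# Hypothetical continuation of the __main__ block.
for i, d in enumerate(result["documents"]):
    print(f"--- split {i} ---")
    print(d.content)
    # Split bookkeeping added by the splitter, including overlap ranges.
    print(d.meta.get("split_id"), d.meta.get("page_number"), d.meta.get("_split_overlap"))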
original_pipeline.png (new file)
@@ -0,0 +1 @@
image_data
@@ -1,50 +1,23 @@
default_branch: main
collapse_pre_releases: true
pre_release_tag_re: (?P<pre_release>-(?:[ab]|rc)+\d*)$
prelude_section_name: highlights
template: |
---
highlights: >
Replace this text with content to appear at the top of the section for this
release. The highlights might repeat some details that are also present in other notes
from the same release, that's ok. Not every release note requires highlights,
use this section only to describe major features or notable changes.
upgrade:
- |
List upgrade notes here, or remove this section.
Upgrade notes should be rare: only list known/potential breaking changes,
or major changes that require user action before the upgrade.
Notes here must include steps that users can follow to 1. know if they're
affected and 2. handle the change gracefully on their end.
features:
- |
List new features here, or remove this section.
enhancements:
- |
List new behavior that is too small to be
considered a new feature, or remove this section.
issues:
- |
List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here.
deprecations:
- |
List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release.
security:
- |
Add security notes here, or remove this section.
fixes:
- |
Add normal bug fixes here, or remove this section.
---
highlights: >
Added support for Chinese DocumentSplitter, enabling precise splitting of Chinese documents.

sections:
# The highlights section is implicitly included.
- [upgrade, Upgrade Notes]
- [features, New Features]
- [enhancements, Enhancement Notes]
- [issues, Known Issues]
- [deprecations, Deprecation Notes]
- [security, Security Notes]
- [fixes, Bug Fixes]
# DO NOT REMOVE. The following sections are no longer used, but were used in the past. Keeping them here avoids reno linting errors.
- [prelude, prelude]
- [preview, preview]
upgrade:
- |
No breaking changes or upgrade steps are required.

features:
- |
Introduced a new DocumentSplitter implementation that supports Chinese text segmentation.

enhancements:
- |
Included a notebook example demonstrating usage of the Chinese DocumentSplitter.

issues: []

deprecations: []

security: []

fixes: []
@@ -0,0 +1,32 @@
---
highlights: >
Replace this text with content to appear at the top of the section for this
release. The highlights might repeat some details that are also present in other notes
from the same release, that's ok. Not every release note requires highlights,
use this section only to describe major features or notable changes.
upgrade:
- |
List upgrade notes here, or remove this section.
Upgrade notes should be rare: only list known/potential breaking changes,
or major changes that require user action before the upgrade.
Notes here must include steps that users can follow to 1. know if they're
affected and 2. handle the change gracefully on their end.
features:
- |
List new features here, or remove this section.
enhancements:
- |
List new behavior that is too small to be
considered a new feature, or remove this section.
issues:
- |
List known issues here, or remove this section. For example, if some change is experimental or known to not work in some cases, it should be mentioned here.
deprecations:
- |
List deprecations notes here, or remove this section. Deprecations should not be used for something that is removed in the release, use upgrade section instead. Deprecation should allow time for users to make necessary changes for the removal to happen in a future release.
security:
- |
Add security notes here, or remove this section.
fixes:
- |
Add normal bug fixes here, or remove this section.