Mirror of https://github.com/docling-project/docling.git, synced 2025-06-27 05:20:05 +00:00
chore: typo fix (#1465)
typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
This commit is contained in:
parent 3afbe6c969
commit a097ccd8d5
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
             return _txt

-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
             rf"<pre>\s*<code>\s*{_START_MARKER}",
             rf"{_STOP_MARKER}\s*</code>\s*</pre>",
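For context, the marker trick this hunk touches works roughly as follows: code content is wrapped in sentinel strings before HTML conversion, and afterwards the wrapping tags plus sentinels are stripped to restore the original HTML. A minimal sketch, assuming placeholder marker values (the backend's actual _START_MARKER/_STOP_MARKER are internal):

import re

# Placeholder sentinels, assumed for illustration only.
_START_MARKER = "DOCLING_START"
_STOP_MARKER = "DOCLING_STOP"

def restore_html(html: str) -> str:
    # Remove the <pre><code> wrapper together with the sentinels,
    # leaving the protected inline HTML untouched.
    for regex in [
        rf"<pre>\s*<code>\s*{_START_MARKER}",
        rf"{_STOP_MARKER}\s*</code>\s*</pre>",
    ]:
        html = re.sub(regex, "", html)
    return html

print(restore_html("<pre><code>DOCLING_START<b>kept</b>DOCLING_STOP</code></pre>"))
# -> <b>kept</b>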
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
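The surrounding logic classifies Word list paragraphs by their style name. A standalone sketch of that check, assuming python-docx and a hypothetical example.docx; the real backend resolves label and level via _get_label_and_level with additional logic:

from docx import Document  # pip install python-docx

doc = Document("example.docx")  # hypothetical input file
for paragraph in doc.paragraphs:
    style_name = paragraph.style.name or ""
    # Common styles for bullet and numbered lists:
    # "List Bullet", "List Number", "List Paragraph"
    if style_name.startswith("List"):
        is_numbered = "List Bullet" not in style_name
        print(f"{style_name!r}: numbered={is_numbered}")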
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream

-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None

         self.valid = False
@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).

-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
             )

         elif name == self.Element.PARAGRAPH.value and text:
-            # remmove blank spaces added in paragraphs
+            # remove blank spaces added in paragraphs
             text = re.sub("\\s+", " ", text)
             if self.Element.ABSTRACT.value in self.property:
                 self.abstract = (
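The normalization in this hunk is plain re.sub whitespace collapsing; a self-contained illustration:

import re

# Collapse any run of whitespace (spaces, tabs, newlines introduced by
# the XML layout) into a single space, as in the hunk above.
text = "A  method\n\tand apparatus  for parsing"
print(re.sub("\\s+", " ", text))  # -> "A method and apparatus for parsing"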
@@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
     """Provide utility functions to get the HTML entities of styled characters.

-    This class has been developped from:
+    This class has been developed from:
     https://unicode-table.com/en/html-entities/
     https://www.w3.org/TR/WD-math-970515/table03.html
     """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
     """Get an HTML entity of a greek letter in ISO 8879.

     Args:
-        The text to transform, as an ISO 8879 entitiy.
+        The text to transform, as an ISO 8879 entity.

     Returns:
         The HTML entity representing a greek letter. If the input text is not
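As a rough illustration of what such an entity lookup does, the Python standard library ships a full HTML5 entity table; this is an analogous mapping, not the backend's own:

from html.entities import html5

# Map entity names (the ISO 8879 greek letters are a subset) to characters.
for name in ("alpha;", "beta;", "Omega;"):
    print(f"&{name}", "->", html5[name])  # e.g. &alpha; -> α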
@@ -521,7 +521,7 @@ def convert(  # noqa: C901
     if image_export_mode != ImageRefMode.PLACEHOLDER:
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+            True  # FIXME: to be deprecated in version 3
         )
         pipeline_options.images_scale = 2

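The options this CLI branch sets can also be driven programmatically; a sketch under the assumption of the docling v2 Python API, mirroring the hunk above:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.generate_page_images = True
pipeline_options.generate_picture_images = True  # flagged for deprecation above
pipeline_options.images_scale = 2

converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)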
@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
             tcells = table_cluster.cells
             tokens = []
             for c in tcells:
-                # Only allow non empty stings (spaces) into the cells of a table
+                # Only allow non empty strings (spaces) into the cells of a table
                 if len(c.text.strip()) > 0:
                     new_cell = copy.deepcopy(c)
                     new_cell.rect = BoundingRectangle.from_bounding_box(
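The filtering step itself is simple; a self-contained sketch with a stand-in cell type (the real cells are docling-core objects carrying a rect as well as text):

import copy

class Cell:  # stand-in for the real text-cell type
    def __init__(self, text: str):
        self.text = text

tcells = [Cell("Header"), Cell("   "), Cell("42")]
tokens = []
for c in tcells:
    # Only allow non-empty strings into the cells of a table.
    if len(c.text.strip()) > 0:
        tokens.append(copy.deepcopy(c))

print([c.text for c in tokens])  # -> ['Header', '42']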
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
             script = map_tesseract_script(script)
             lang = f"{self.script_prefix}{script}"

-            # Check if the detected languge is present in the system
+            # Check if the detected language is present in the system
             if lang not in self._tesserocr_languages:
                 msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                 msg += " However this language is not installed in your system and will be ignored."
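A sketch of that availability check, assuming tesserocr is installed; get_languages() returns the tessdata path plus the installed language codes, and the "script/" prefix used here is an assumption, not necessarily the backend's script_prefix:

import tesserocr

_, installed = tesserocr.get_languages()
script = "Latin"
lang = f"script/{script}"  # assumed prefix for script-specific models
if lang not in installed:
    msg = f"Tesseract detected the script '{script}' and language '{lang}'."
    msg += " However this language is not installed in your system and will be ignored."
    print(msg)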
@@ -569,7 +569,7 @@
     "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
     "In this notebook, we will leverage:\n",
     "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
-    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
     "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
     "\n",
     "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
@@ -206,7 +206,7 @@
    "source": [
     "Points to notice looking at the output chunks below:\n",
     "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
-    "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+    "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
     "- Where possible, we merge undersized peer chunks (see chunk 0)\n",
     "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
    ]
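The 64-token behavior described in that cell comes from the chunker configuration; a minimal sketch, assuming the docling HybridChunker API and a placeholder input file:

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("example.pdf").document  # placeholder input
chunker = HybridChunker(max_tokens=64)  # the limit discussed in the cell above
for i, chunk in enumerate(chunker.chunk(dl_doc=doc)):
    print(i, chunker.serialize(chunk=chunk)[:80])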
@@ -279,7 +279,7 @@
     "## Use other vision models\n",
     "\n",
     "The examples above can also be reproduced using other vision model.\n",
-    "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+    "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
    ]
   },
   {
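A sketch of that option, assuming the docling PictureDescriptionVlmOptions API; the repo id and prompt are placeholders, not a recommendation:

from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # any HF vision-language model
    prompt="Describe the image in three sentences.",
)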
@@ -32,7 +32,7 @@
     "\n",
     "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
     "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
    ]
   },
   {
@@ -43,7 +43,7 @@
     "\n",
     "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
     "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
    ]
   },
   {
@@ -716,7 +716,7 @@
     "id": "7tGz49nfUegG"
    },
    "source": [
-    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
    ]
   }
  ],
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on <https://github
 ### Some images are missing from MS Word and Powerpoint

 The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
-If you are on other operaring systems, these images will be ignored.
+If you are on other operating systems, these images will be ignored.


 ??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
@@ -37,7 +37,7 @@ docling ./input/dir --output ./scratch --abort-on-error

 ### Setting up a `DocumentConverter`

-To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
+To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
 You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
 per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
 will be used for all `allowed_formats`.
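A minimal sketch of the new-style setup this section describes, assuming docling v2: restrict allowed_formats and pass per-format options.

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

doc_converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],  # others are rejected
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=PdfPipelineOptions()),
    },
)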
@@ -151,7 +151,7 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
 ## Inspect the converted document:
 conv_result.document.print_element_tree()

-## Iterate the elements in reading order, including hierachy level:
+## Iterate the elements in reading order, including hierarchy level:
 for item, level in conv_result.document.iterate_items():
     if isinstance(item, TextItem):
         print(item.text)
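A runnable version of the snippet in this hunk, assuming docling is installed; the source here is a placeholder, since the URL in the hunk header is truncated:

from docling.document_converter import DocumentConverter
from docling_core.types.doc import TextItem

conv_result = DocumentConverter().convert("example.pdf")  # placeholder source

# Inspect the converted document:
conv_result.document.print_element_tree()

# Iterate the elements in reading order, including hierarchy level:
for item, level in conv_result.document.iterate_items():
    if isinstance(item, TextItem):
        print(level, item.text)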