Mirror of https://github.com/docling-project/docling.git
chore: typo fix (#1465)
* typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
parent 3afbe6c969
commit a097ccd8d5
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         return _txt

-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
             rf"<pre>\s*<code>\s*{_START_MARKER}",
             rf"{_STOP_MARKER}\s*</code>\s*</pre>",
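For context, the strip step in this hunk can be sketched on its own; the sentinel values below are placeholders, since the backend defines its own _START_MARKER/_STOP_MARKER constants:

import re

# Placeholder sentinels; docling's MarkdownDocumentBackend defines its own.
_START_MARKER = "#_#_DOCLING_START_#_#"
_STOP_MARKER = "#_#_DOCLING_STOP_#_#"

def restore_html(html: str) -> str:
    # Remove the <pre><code> wrappers the Markdown renderer adds around
    # marked spans, restoring the embedded HTML to its original form.
    for regex in [
        rf"<pre>\s*<code>\s*{_START_MARKER}",
        rf"{_STOP_MARKER}\s*</code>\s*</pre>",
    ]:
        html = re.sub(regex, "", html)
    return html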
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream

-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None

         self.valid = False
@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).

-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                 )

             elif name == self.Element.PARAGRAPH.value and text:
-                # remmove blank spaces added in paragraphs
+                # remove blank spaces added in paragraphs
                 text = re.sub("\\s+", " ", text)
                 if self.Element.ABSTRACT.value in self.property:
                     self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
     """Provide utility functions to get the HTML entities of styled characters.

-    This class has been developped from:
+    This class has been developed from:
     https://unicode-table.com/en/html-entities/
     https://www.w3.org/TR/WD-math-970515/table03.html
     """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
         """Get an HTML entity of a greek letter in ISO 8879.

         Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.

         Returns:
             The HTML entity representing a greek letter. If the input text is not
@@ -521,7 +521,7 @@ def convert( # noqa: C901
     if image_export_mode != ImageRefMode.PLACEHOLDER:
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+            True  # FIXME: to be deprecated in version 3
         )
         pipeline_options.images_scale = 2

@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
                     tcells = table_cluster.cells
                     tokens = []
                     for c in tcells:
-                        # Only allow non empty stings (spaces) into the cells of a table
+                        # Only allow non empty strings (spaces) into the cells of a table
                         if len(c.text.strip()) > 0:
                             new_cell = copy.deepcopy(c)
                             new_cell.rect = BoundingRectangle.from_bounding_box(
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
                 script = map_tesseract_script(script)
                 lang = f"{self.script_prefix}{script}"

-                # Check if the detected languge is present in the system
+                # Check if the detected language is present in the system
                 if lang not in self._tesserocr_languages:
                     msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                     msg += " However this language is not installed in your system and will be ignored."
@@ -569,7 +569,7 @@
    "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
    "In this notebook, we will leverage:\n",
    "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
-    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
    "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
    "\n",
    "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
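A minimal sketch of the reader/parser pairing this cell describes, assuming the llama-index-readers-docling and llama-index-node-parser-docling packages; the input directory is a placeholder and the Milvus vector store setup is omitted for brevity:

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# DoclingReader converts each file into Docling's JSON representation;
# DoclingNodeParser then splits it into hierarchy-aware chunks.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
documents = SimpleDirectoryReader(
    input_dir="./xml_exports",  # placeholder: directory of exported XML files
    file_extractor={".xml": reader},
).load_data()
index = VectorStoreIndex.from_documents(documents, transformations=[DoclingNodeParser()])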
@@ -206,7 +206,7 @@
   "source": [
    "Points to notice looking at the output chunks below:\n",
    "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
-    "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+    "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
    "- Where possible, we merge undersized peer chunks (see chunk 0)\n",
    "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
   ]
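The behavior these notes describe comes from docling's HybridChunker; a minimal sketch, where the tokenizer id and the 64-token cap are illustrative choices rather than values quoted from the notebook:

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("https://arxiv.org/pdf/2408.09869").document

# max_tokens caps the metadata-enriched serialization of each chunk;
# merge_peers=True merges undersized sibling chunks where possible.
chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=64,
    merge_peers=True,
)
for chunk in chunker.chunk(dl_doc=doc):
    print(chunk.text)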
@@ -279,7 +279,7 @@
    "## Use other vision models\n",
    "\n",
    "The examples above can also be reproduced using other vision model.\n",
-    "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+    "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
   ]
  },
  {
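A minimal sketch of that option in use; the repo id and prompt are placeholders, not values from the notebook:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.generate_picture_images = True
# Any compatible vision-language model from the Hugging Face Hub can go here.
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # placeholder model choice
    prompt="Describe the image in three sentences.",
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)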
@@ -32,7 +32,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
   ]
  },
  {
@@ -43,7 +43,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
   ]
  },
  {
@@ -716,7 +716,7 @@
    "id": "7tGz49nfUegG"
   },
   "source": [
-    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
   ]
  }
 ],
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on <https://github
 ### Some images are missing from MS Word and Powerpoint

 The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
-If you are on other operaring systems, these images will be ignored.
+If you are on other operating systems, these images will be ignored.


 ??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
@@ -37,7 +37,7 @@ docling ./input/dir --output ./scratch --abort-on-error

 ### Setting up a `DocumentConverter`

-To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
+To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
 You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
 per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
 will be used for all `allowed_formats`.
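To make that setup concrete, a minimal sketch of a converter restricted to two formats with a per-format option, following the API described above (the pipeline tweak is an arbitrary example):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_options = PdfPipelineOptions()
pdf_options.do_ocr = False  # example per-format customization

# Formats without an entry in format_options fall back to their defaults.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)},
)
result = converter.convert("https://arxiv.org/pdf/2408.09869")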
@@ -151,7 +151,7 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
 ## Inspect the converted document:
 conv_result.document.print_element_tree()

-## Iterate the elements in reading order, including hierachy level:
+## Iterate the elements in reading order, including hierarchy level:
 for item, level in conv_result.document.iterate_items():
     if isinstance(item, TextItem):
         print(item.text)