Mirror of https://github.com/docling-project/docling.git
chore: typo fix (#1465)
* typo fix

Signed-off-by: nkh0472 <67589323+nkh0472@users.noreply.github.com>
parent 3afbe6c969
commit a097ccd8d5
@@ -409,7 +409,7 @@ class MarkdownDocumentBackend(DeclarativeDocumentBackend):
             )
         return _txt

-        # restore original HTML by removing previouly added markers
+        # restore original HTML by removing previously added markers
         for regex in [
             rf"<pre>\s*<code>\s*{_START_MARKER}",
             rf"{_STOP_MARKER}\s*</code>\s*</pre>",
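For context, the strip step in this hunk can be sketched on its own; the sentinel values below are placeholders, since the backend defines its own _START_MARKER/_STOP_MARKER constants:

import re

# Placeholder sentinels; docling's MarkdownDocumentBackend defines its own.
_START_MARKER = "#_#_DOCLING_START_#_#"
_STOP_MARKER = "#_#_DOCLING_STOP_#_#"

def restore_html(html: str) -> str:
    # Remove the <pre><code> wrappers the Markdown renderer adds around
    # marked spans, restoring the embedded HTML to its original form.
    for regex in [
        rf"<pre>\s*<code>\s*{_START_MARKER}",
        rf"{_STOP_MARKER}\s*</code>\s*</pre>",
    ]:
        html = re.sub(regex, "", html)
    return html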
@@ -436,7 +436,7 @@ class MsWordDocumentBackend(DeclarativeDocumentBackend):

         # Common styles for bullet and numbered lists.
         # "List Bullet", "List Number", "List Paragraph"
-        # Identify wether list is a numbered list or not
+        # Identify whether list is a numbered list or not
         # is_numbered = "List Bullet" not in paragraph.style.name
         is_numbered = False
         p_style_id, p_level = self._get_label_and_level(paragraph)
@@ -91,7 +91,7 @@ class JatsDocumentBackend(DeclarativeDocumentBackend):
         super().__init__(in_doc, path_or_stream)
         self.path_or_stream = path_or_stream

-        # Initialize the root of the document hiearchy
+        # Initialize the root of the document hierarchy
         self.root: Optional[NodeItem] = None

         self.valid = False
@@ -1,6 +1,6 @@
 """Backend to parse patents from the United States Patent Office (USPTO).

-The parsers included in this module can handle patent grants pubished since 1976 and
+The parsers included in this module can handle patent grants published since 1976 and
 patent applications since 2001.
 The original files can be found in https://bulkdata.uspto.gov.
 """
@@ -440,7 +440,7 @@ class PatentUsptoIce(PatentUspto):
                 )

             elif name == self.Element.PARAGRAPH.value and text:
-                # remmove blank spaces added in paragraphs
+                # remove blank spaces added in paragraphs
                 text = re.sub("\\s+", " ", text)
                 if self.Element.ABSTRACT.value in self.property:
                     self.abstract = (
@@ -1697,7 +1697,7 @@ class XmlTable:
 class HtmlEntity:
     """Provide utility functions to get the HTML entities of styled characters.

-    This class has been developped from:
+    This class has been developed from:
     https://unicode-table.com/en/html-entities/
     https://www.w3.org/TR/WD-math-970515/table03.html
     """
@@ -1896,7 +1896,7 @@ class HtmlEntity:
         """Get an HTML entity of a greek letter in ISO 8879.

         Args:
-            The text to transform, as an ISO 8879 entitiy.
+            The text to transform, as an ISO 8879 entity.

         Returns:
             The HTML entity representing a greek letter. If the input text is not
@@ -521,7 +521,7 @@ def convert( # noqa: C901
     if image_export_mode != ImageRefMode.PLACEHOLDER:
         pipeline_options.generate_page_images = True
         pipeline_options.generate_picture_images = (
-            True  # FIXME: to be deprecated in verson 3
+            True  # FIXME: to be deprecated in version 3
         )
         pipeline_options.images_scale = 2

@@ -234,7 +234,7 @@ class TableStructureModel(BasePageModel):
                     tcells = table_cluster.cells
                     tokens = []
                     for c in tcells:
-                        # Only allow non empty stings (spaces) into the cells of a table
+                        # Only allow non empty strings (spaces) into the cells of a table
                         if len(c.text.strip()) > 0:
                             new_cell = copy.deepcopy(c)
                             new_cell.rect = BoundingRectangle.from_bounding_box(
@@ -151,7 +151,7 @@ class TesseractOcrModel(BaseOcrModel):
                 script = map_tesseract_script(script)
                 lang = f"{self.script_prefix}{script}"

-                # Check if the detected languge is present in the system
+                # Check if the detected language is present in the system
                 if lang not in self._tesserocr_languages:
                     msg = f"Tesseract detected the script '{script}' and language '{lang}'."
                     msg += " However this language is not installed in your system and will be ignored."
@@ -569,7 +569,7 @@
    "The `DoclingDocument` format of the converted patents has a rich hierarchical structure, inherited from the original XML document and preserved by the Docling custom backend.\n",
    "In this notebook, we will leverage:\n",
    "- The `SimpleDirectoryReader` pattern to iterate over the exported XML files created in section [Fetch the data](#fetch-the-data).\n",
-    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vectore store.\n",
+    "- The LlamaIndex extensions, `DoclingReader` and `DoclingNodeParser`, to ingest the patent chunks into a Milvus vector store.\n",
    "- The `HierarchicalChunker` implementation, which applies a document-based hierarchical chunking, to leverage the patent structures like sections and paragraphs within sections.\n",
    "\n",
    "Refer to other possible implementations and usage patterns in the [Chunking](../../concepts/chunking/) documentation and the [RAG with LlamaIndex](../rag_llamaindex/) notebook."
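A minimal sketch of the reader/parser pairing this cell describes, assuming the llama-index-readers-docling and llama-index-node-parser-docling packages; the input directory is a placeholder and the Milvus vector store setup is omitted for brevity:

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.node_parser.docling import DoclingNodeParser
from llama_index.readers.docling import DoclingReader

# DoclingReader converts each file into Docling's JSON representation;
# DoclingNodeParser then splits it into hierarchy-aware chunks.
reader = DoclingReader(export_type=DoclingReader.ExportType.JSON)
documents = SimpleDirectoryReader(
    input_dir="./xml_exports",  # placeholder: directory of exported XML files
    file_extractor={".xml": reader},
).load_data()
index = VectorStoreIndex.from_documents(documents, transformations=[DoclingNodeParser()])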
@@ -206,7 +206,7 @@
   "source": [
    "Points to notice looking at the output chunks below:\n",
    "- Where possible, we fit the limit of 64 tokens for the metadata-enriched serialization form (see chunk 2)\n",
-    "- Where neeeded, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
+    "- Where needed, we stop before the limit, e.g. see cases of 63 as it would otherwise run into a comma (see chunk 6)\n",
    "- Where possible, we merge undersized peer chunks (see chunk 0)\n",
    "- \"Tail\" chunks trailing right after merges may still be undersized (see chunk 8)"
   ]
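The behavior these notes describe comes from docling's HybridChunker; a minimal sketch, where the tokenizer id and the 64-token cap are illustrative choices rather than values quoted from the notebook:

from docling.chunking import HybridChunker
from docling.document_converter import DocumentConverter

doc = DocumentConverter().convert("https://arxiv.org/pdf/2408.09869").document

# max_tokens caps the metadata-enriched serialization of each chunk;
# merge_peers=True merges undersized sibling chunks where possible.
chunker = HybridChunker(
    tokenizer="sentence-transformers/all-MiniLM-L6-v2",
    max_tokens=64,
    merge_peers=True,
)
for chunk in chunker.chunk(dl_doc=doc):
    print(chunk.text)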
@@ -279,7 +279,7 @@
    "## Use other vision models\n",
    "\n",
    "The examples above can also be reproduced using other vision model.\n",
-    "The Docling options `PictureDescriptionVlmOptions` allows to speficy your favorite vision model from the Hugging Face Hub."
+    "The Docling options `PictureDescriptionVlmOptions` allows to specify your favorite vision model from the Hugging Face Hub."
   ]
  },
  {
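A minimal sketch of that option in use; the repo id and prompt are placeholders, not values from the notebook:

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import (
    PdfPipelineOptions,
    PictureDescriptionVlmOptions,
)
from docling.document_converter import DocumentConverter, PdfFormatOption

pipeline_options = PdfPipelineOptions()
pipeline_options.do_picture_description = True
pipeline_options.generate_picture_images = True
# Any compatible vision-language model from the Hugging Face Hub can go here.
pipeline_options.picture_description_options = PictureDescriptionVlmOptions(
    repo_id="HuggingFaceTB/SmolVLM-256M-Instruct",  # placeholder model choice
    prompt="Describe the image in three sentences.",
)
converter = DocumentConverter(
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)}
)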
@@ -32,7 +32,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU.\n"
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU.\n"
   ]
  },
  {
@@ -43,7 +43,7 @@
    "\n",
    "Note: For best results, please use **GPU acceleration** to run this notebook. Here are two options for running this notebook:\n",
    "1. **Locally on a MacBook with an Apple Silicon chip.** Converting all documents in the notebook takes ~2 minutes on a MacBook M2 due to Docling's usage of MPS accelerators.\n",
-    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 mintutes on a Google Colab T4 GPU."
+    "2. **Run this notebook on Google Colab.** Converting all documents in the notebook takes ~8 minutes on a Google Colab T4 GPU."
   ]
  },
  {
@@ -716,7 +716,7 @@
    "id": "7tGz49nfUegG"
   },
   "source": [
-    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documetation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
+    "We can see that our RAG pipeline performs relatively well for simple queries, especially given the small size of the dataset. Scaling this method for converting a larger sample of PDFs would require more compute (GPUs) and a more advanced deployment of Weaviate (like Docker, Kubernetes, or Weaviate Cloud). For more information on available Weaviate configurations, check out the [documentation](https://weaviate.io/developers/weaviate/starter-guides/which-weaviate)."
   ]
  }
 ],
@@ -137,7 +137,7 @@ This is a collection of FAQ collected from the user questions on <https://github
 ### Some images are missing from MS Word and Powerpoint

 The image processing library used by Docling is able to handle embedded WMF images only on Windows platform.
-If you are on other operaring systems, these images will be ignored.
+If you are on other operating systems, these images will be ignored.


 ??? question "`HybridChunker` triggers warning: 'Token indices sequence length is longer than the specified maximum sequence length for this model'"
@@ -37,7 +37,7 @@ docling ./input/dir --output ./scratch --abort-on-error

 ### Setting up a `DocumentConverter`

-To accomodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
+To accommodate many input formats, we changed the way you need to set up your `DocumentConverter` object.
 You can now define a list of allowed formats on the `DocumentConverter` initialization, and specify custom options
 per-format if desired. By default, all supported formats are allowed. If you don't provide `format_options`, defaults
 will be used for all `allowed_formats`.
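To make that setup concrete, a minimal sketch of a converter restricted to two formats with a per-format option, following the API described above (the pipeline tweak is an arbitrary example):

from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

pdf_options = PdfPipelineOptions()
pdf_options.do_ocr = False  # example per-format customization

# Formats without an entry in format_options fall back to their defaults.
converter = DocumentConverter(
    allowed_formats=[InputFormat.PDF, InputFormat.DOCX],
    format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_options)},
)
result = converter.convert("https://arxiv.org/pdf/2408.09869")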
@@ -151,7 +151,7 @@ conv_result: ConversionResult = doc_converter.convert("https://arxiv.org/pdf/240
 ## Inspect the converted document:
 conv_result.document.print_element_tree()

-## Iterate the elements in reading order, including hierachy level:
+## Iterate the elements in reading order, including hierarchy level:
 for item, level in conv_result.document.iterate_items():
     if isinstance(item, TextItem):
         print(item.text)