From 9062d25d0d5700a92159c9d91c014c86bfd25a4b Mon Sep 17 00:00:00 2001 From: Tom Aarsen <37621491+tomaarsen@users.noreply.github.com> Date: Sat, 25 Feb 2023 02:48:23 +0100 Subject: [PATCH] Resolve numerous typos (#280) * Resolve numerous typos * Resolve typo in mime type --- README.md | 2 +- docs/source/bricks.rst | 10 +++++----- docs/source/elements.rst | 2 +- examples/argilla-summarization/README.md | 2 +- examples/sec-sentiment-analysis/README.md | 2 +- examples/sec-sentiment-analysis/fetch.py | 4 ++-- examples/training/0-Core Concepts.ipynb | 2 +- examples/training/1-Intro to Bricks.ipynb | 4 ++-- examples/training/2-File Exploration.ipynb | 2 +- requirements/test.in | 2 +- unstructured/cleaners/core.py | 2 +- unstructured/cleaners/extract.py | 6 +++--- unstructured/cleaners/translate.py | 2 +- unstructured/documents/html.py | 2 +- unstructured/file_utils/filetype.py | 2 +- unstructured/ingest/interfaces.py | 2 +- unstructured/partition/docx.py | 2 +- unstructured/partition/text_type.py | 2 +- unstructured/staging/datasaur.py | 2 +- 19 files changed, 27 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index 731442f41..2e4861591 100644 --- a/README.md +++ b/README.md @@ -228,7 +228,7 @@ The output will look the same as the example from the document parsing section a ### E-mail Parsing The `partition_email` function within `unstructured` is helpful for parsing `.eml` files. Common -e-mail clients such as Microsoft Outlook and Gmail support exproting e-mails as `.eml` files. +e-mail clients such as Microsoft Outlook and Gmail support exporting e-mails as `.eml` files. `partition_email` accepts filenames, file-like object, and raw text as input. The following three snippets for parsing `.eml` files are equivalent: diff --git a/docs/source/bricks.rst b/docs/source/bricks.rst index 547b23c20..b42a943b9 100644 --- a/docs/source/bricks.rst +++ b/docs/source/bricks.rst @@ -20,7 +20,7 @@ titles, narrative text, and tables. 
The ``partition`` brick is the simplest way to partition a document in ``unstructured``. If you call the ``partition`` function, ``unstructured`` will attempt to detect the file type and route it to the appropriate partitioning brick. All partitioning bricks -called within ``partition`` are called using the defualt kwargs. Use the document-type +called within ``partition`` are called using the default kwargs. Use the document-type specific bricks if you need to apply non-default settings. ``partition`` currently supports ``.docx``, ``.doc``, ``.pptx``, ``.ppt``, ``.eml``, ``.html``, ``.pdf``, ``.png``, ``.jpg``, and ``.txt`` files. @@ -539,7 +539,7 @@ Examples: ``clean_ordered_bullets`` ------------------------- -Remove alpha-numeric bullets from the beginning of text up to three “sub-section” levels. +Remove alphanumeric bullets from the beginning of text up to three “sub-section” levels. Examples: @@ -687,7 +687,7 @@ Extracts text that occurs before the specified pattern. Options: -* If ``index`` is set, extract before the ``(index + 1)``th occurence of the pattern. The default is ``0``. +* If ``index`` is set, extract before the ``(index + 1)``th occurrence of the pattern. The default is ``0``. * Strips leading whitespace if ``strip`` is set to ``True``. The default is ``True``. @@ -710,7 +710,7 @@ Extracts text that occurs after the specified pattern. Options: -* If ``index`` is set, extract after the ``(index + 1)``th occurence of the pattern. The default is ``0``. +* If ``index`` is set, extract after the ``(index + 1)``th occurrence of the pattern. The default is ``0``. * Strips trailing whitespace if ``strip`` is set to ``True``. The default is ``True``. @@ -834,7 +834,7 @@ Examples: ``extract_ordered_bullets`` --------------------------- -Extracts alpha-numeric bullets from the beginning of text up to three “sub-section” levels. +Extracts alphanumeric bullets from the beginning of text up to three “sub-section” levels. 
Examples: diff --git a/docs/source/elements.rst b/docs/source/elements.rst index 68b1f80e6..ab8c299ea 100644 --- a/docs/source/elements.rst +++ b/docs/source/elements.rst @@ -2,7 +2,7 @@ Elements -------- The following are the structured page elements that are available within the ``unstructured`` -package. Partioning bricks convert raw documents to this common set of elements. If you need +package. Partitioning bricks convert raw documents to this common set of elements. If you need a custom element, the recommended approach is to create a sub-class of one of the default elements. diff --git a/examples/argilla-summarization/README.md b/examples/argilla-summarization/README.md index 6ac794cbe..3829d339e 100644 --- a/examples/argilla-summarization/README.md +++ b/examples/argilla-summarization/README.md @@ -8,7 +8,7 @@ complete a data science project in hours that previously would have taken weeks. To get started, use the following steps: - Ensure you have Python 3.8 or higher installed on your system -- Create a new Python virtual enviornment +- Create a new Python virtual environment - Run `pip install -r requirements.txt` to install the dependencies - Run `PYTHONPATH=. jupyter notebook` from this directory to launch the notebook diff --git a/examples/sec-sentiment-analysis/README.md b/examples/sec-sentiment-analysis/README.md index 656b18266..2d296251f 100644 --- a/examples/sec-sentiment-analysis/README.md +++ b/examples/sec-sentiment-analysis/README.md @@ -5,7 +5,7 @@ and several bricks from the `unstructured` library to train a sentiment analysis risk factors section of S-1 filings. To get started, use the following steps: - Ensure you have Python 3.8 or higher installed on your system -- Create a new Python virtual enviornment +- Create a new Python virtual environment - Run `pip install -r requirements.txt` to install the dependencies - Run `PYTHONPATH=. 
jupyter notebook` from this directory to launch the notebook diff --git a/examples/sec-sentiment-analysis/fetch.py b/examples/sec-sentiment-analysis/fetch.py index 203e1d3da..e441d2bba 100644 --- a/examples/sec-sentiment-analysis/fetch.py +++ b/examples/sec-sentiment-analysis/fetch.py @@ -125,7 +125,7 @@ def get_form_by_ticker( def _form_types(form_type: str, allow_amended_filing: Optional[bool] = True): - """Potentialy expand to include amended filing, e.g.: + """Potentially expand to include amended filing, e.g.: "10-Q" -> "10-Q/A" """ assert form_type in VALID_FILING_TYPES @@ -144,7 +144,7 @@ def get_form_by_cik( ) -> str: """For a given CIK, returns the most recent form of a given form_type. By default an amended version of the form_type may be retrieved (allow_amended_filing=True). - E.g., if form_type is "10-Q", the retrived form could be a 10-Q or 10-Q/A. + E.g., if form_type is "10-Q", the retrieved form could be a 10-Q or 10-Q/A. """ session = _get_session(company, email) acc_num, _ = _get_recent_acc_num_by_cik( diff --git a/examples/training/0-Core Concepts.ipynb b/examples/training/0-Core Concepts.ipynb index 8a4480756..56bb14d44 100644 --- a/examples/training/0-Core Concepts.ipynb +++ b/examples/training/0-Core Concepts.ipynb @@ -187,7 +187,7 @@ " - `Image`\n", " - `PageBreak`\n", " \n", - "Other element types that we will add in the future include tables and figures. Different partioning functions use different methods for determining the element type and extracting the associated content. Document elements have a `str` representation. You can print them using the snippet below." + "Other element types that we will add in the future include tables and figures. Different partitioning functions use different methods for determining the element type and extracting the associated content. Document elements have a `str` representation. You can print them using the snippet below." 
] }, { diff --git a/examples/training/1-Intro to Bricks.ipynb b/examples/training/1-Intro to Bricks.ipynb index 632de6477..38de55ba5 100644 --- a/examples/training/1-Intro to Bricks.ipynb +++ b/examples/training/1-Intro to Bricks.ipynb @@ -143,7 +143,7 @@ "id": "e3a8e7f4", "metadata": {}, "source": [ - "The `unstructured` library also includes partitioning bricks targeted at specific document types. The `partition` brick uses these document-specific partitioning bricks under the hood. There are a few reasons you may want to use a document-specific partioning brick instead of `partition`:\n", + "The `unstructured` library also includes partitioning bricks targeted at specific document types. The `partition` brick uses these document-specific partitioning bricks under the hood. There are a few reasons you may want to use a document-specific partitioning brick instead of `partition`:\n", "\n", "1. If you already know the document type, filetype detection is unnecessary. Using the document-specific brick directly will make your program run faster.\n", "2. Fewer dependencies. You don't need to install `libmagic` for filetype detection if you're only using document-specific bricks.\n", @@ -312,7 +312,7 @@ "id": "358e149b", "metadata": {}, "source": [ - "Since a cleaning brick is just a `str -> str` function, users can also easily include their own cleaning bricks for custom data preparation tasks. In the example below, we partition a Russian offensive campaign assessment from the institute of the study of war and remove citations, which are not natural language text that we want to inclue for model training purposes." + "Since a cleaning brick is just a `str -> str` function, users can also easily include their own cleaning bricks for custom data preparation tasks. 
In the example below, we partition a Russian offensive campaign assessment from the institute of the study of war and remove citations, which are not natural language text that we want to include for model training purposes." ] }, { diff --git a/examples/training/2-File Exploration.ipynb b/examples/training/2-File Exploration.ipynb index a0bcd95c2..bc9097e80 100644 --- a/examples/training/2-File Exploration.ipynb +++ b/examples/training/2-File Exploration.ipynb @@ -7,7 +7,7 @@ "source": [ "# File Exploration\n", "\n", - "In addition to core document processing capabilities, the `unstructured` library includes utilities for summarizing information about raw doucments. We will cover how to use these utilities in this notebook. At the conclusion of this notebook, you should understand:\n", + "In addition to core document processing capabilities, the `unstructured` library includes utilities for summarizing information about raw documents. We will cover how to use these utilities in this notebook. At the conclusion of this notebook, you should understand:\n", "\n", "- [Filetype detection in `unstructured`](#filetype)\n", "- [How to generate summary statistics about documents](#summary)" diff --git a/requirements/test.in b/requirements/test.in index 559789857..3d8499149 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -15,5 +15,5 @@ types-requests vcrpy # NOTE(robinson) - The following pins are to address -# vulernabilities in dependency scans +# vulnerabilities in dependency scans certifi>=2022.12.07 diff --git a/unstructured/cleaners/core.py b/unstructured/cleaners/core.py index 5b2c76bc0..8c6656086 100644 --- a/unstructured/cleaners/core.py +++ b/unstructured/cleaners/core.py @@ -23,7 +23,7 @@ def clean_bullets(text) -> str: def clean_ordered_bullets(text) -> str: """Cleans the start of bulleted text sections up to three “sub-section” - bullets accounting numeric and alpha-numeric types. + bullets accounting numeric and alphanumeric types. 
Example ------- diff --git a/unstructured/cleaners/extract.py b/unstructured/cleaners/extract.py index b17a2be0d..f13f3581f 100644 --- a/unstructured/cleaners/extract.py +++ b/unstructured/cleaners/extract.py @@ -29,7 +29,7 @@ def _get_indexed_match(text: str, pattern: str, index: int = 0) -> re.Match: def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = True) -> str: """Extracts texts that occurs before the specified pattern. By default, it will use - the first occurence of the pattern (index 0). Use the index kwarg to choose a different + the first occurrence of the pattern (index 0). Use the index kwarg to choose a different index. Input @@ -44,7 +44,7 @@ def extract_text_before(text: str, pattern: str, index: int = 0, strip: bool = T def extract_text_after(text: str, pattern: str, index: int = 0, strip: bool = True) -> str: """Extracts texts that occurs before the specified pattern. By default, it will use - the first occurence of the pattern (index 0). Use the index kwarg to choose a different + the first occurrence of the pattern (index 0). Use the index kwarg to choose a different index. Input @@ -99,7 +99,7 @@ def extract_us_phone_number(text: str): def extract_ordered_bullets(text) -> tuple: """Extracts the start of bulleted text sections bullets - accounting numeric and alpha-numeric types. + accounting numeric and alphanumeric types. Output ----- diff --git a/unstructured/cleaners/translate.py b/unstructured/cleaners/translate.py index f2c0292d1..e7e5ce49a 100644 --- a/unstructured/cleaners/translate.py +++ b/unstructured/cleaners/translate.py @@ -59,7 +59,7 @@ def translate_text(text, source_lang: Optional[str] = None, target_lang: str = " except OSError: raise ValueError( f"Transformers could not find the translation model {model_name}. " - "The requested source/target language combo is not suppored." + "The requested source/target language combo is not supported." 
) chunks: List[str] = chunk_by_attention_window(text, tokenizer, split_function=sent_tokenize) diff --git a/unstructured/documents/html.py b/unstructured/documents/html.py index 04dcf4f48..7d2491f14 100644 --- a/unstructured/documents/html.py +++ b/unstructured/documents/html.py @@ -230,7 +230,7 @@ def _text_to_element(text: str, tag: str, ancestortags: Tuple[str, ...]) -> Opti def _is_container_with_text(tag_elem: etree.Element) -> bool: - """Checks if a tag is a container that also happens to containe text. + """Checks if a tag is a container that also happens to contain text. Example -------
Hi there, diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py index 68909f274..2127451b9 100644 --- a/unstructured/file_utils/filetype.py +++ b/unstructured/file_utils/filetype.py @@ -236,7 +236,7 @@ def _detect_filetype_from_octet_stream(file: IO) -> FileType: elif all([f in archive_filenames for f in EXPECTED_PPTX_FILES]): return FileType.PPTX - logger.warning("Could not detect the filetype from application/octet-strem MIME type.") + logger.warning("Could not detect the filetype from application/octet-stream MIME type.") return FileType.UNK diff --git a/unstructured/ingest/interfaces.py b/unstructured/ingest/interfaces.py index 5b434016d..caed910f4 100644 --- a/unstructured/ingest/interfaces.py +++ b/unstructured/ingest/interfaces.py @@ -16,7 +16,7 @@ class BaseConnector(ABC): @abstractmethod def cleanup(self, cur_dir=None): - """Any additonal cleanup up need after processing is complete. E.g., removing + """Any additional cleanup needed after processing is complete. E.g., removing temporary download dirs that are empty. By convention, documents that failed to process are typically not cleaned up.""" diff --git a/unstructured/partition/docx.py b/unstructured/partition/docx.py index 6902a4eb8..a97ce2a4e 100644 --- a/unstructured/partition/docx.py +++ b/unstructured/partition/docx.py @@ -98,7 +98,7 @@ def partition_docx( def _paragraph_to_element(paragraph: docx.text.paragraph.Paragraph) -> Optional[Text]: """Converts a docx Paragraph object into the appropriate unstructured document element.
- If the paragaraph style is "Normal" or unknown, we try to predict the element type from the + If the paragraph style is "Normal" or unknown, we try to predict the element type from the raw text.""" text = paragraph.text style_name = paragraph.style.name diff --git a/unstructured/partition/text_type.py b/unstructured/partition/text_type.py index e623a1525..30bba7892 100644 --- a/unstructured/partition/text_type.py +++ b/unstructured/partition/text_type.py @@ -228,7 +228,7 @@ def under_non_alpha_ratio(text: str, threshold: float = 0.5): def exceeds_cap_ratio(text: str, threshold: float = 0.5) -> bool: """Checks the title ratio in a section of text. If a sufficient proportion of the words - are capitalized, that can be indiciated on non-narrative text (i.e. "1A. Risk Factors"). + are capitalized, that can be indicative of non-narrative text (e.g. "1A. Risk Factors"). Parameters ---------- diff --git a/unstructured/staging/datasaur.py b/unstructured/staging/datasaur.py index ac8f1bf2c..8665a3c97 100644 --- a/unstructured/staging/datasaur.py +++ b/unstructured/staging/datasaur.py @@ -12,7 +12,7 @@ def stage_for_datasaur( _entities: List[List[Dict[str, Any]]] = [[] for _ in range(len(elements))] if entities is not None: if len(entities) != len(elements): - raise ValueError("If entities is specified, it must be the same lenth as elements.") + raise ValueError("If entities is specified, it must be the same length as elements.") for entity_list in entities: for entity in entity_list: