mirror of
				https://github.com/Unstructured-IO/unstructured.git
				synced 2025-11-04 03:53:45 +00:00 
			
		
		
		
	test: create CI pipelines for verifying base and extras pass respective tests (#1137)
**Summary** Closes #747
* Create a CI pipeline that runs the text, xml, email, and html doc tests against the library installed without extras
* Create a CI pipeline that runs each library extra against its respective tests
This commit is contained in:
		
							parent
							
								
									69edffb0c0
								
							
						
					
					
						commit
						e4aa7373e2
					
				
							
								
								
									
										81
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							
							
						
						
									
										81
									
								
								.github/workflows/ci.yml
									
									
									
									
										vendored
									
									
								
							@ -152,6 +152,87 @@ jobs:
 | 
			
		||||
        make test CI=true
 | 
			
		||||
        make check-coverage
 | 
			
		||||
 | 
			
		||||
  test_unit_no_extras:
 | 
			
		||||
    strategy:
 | 
			
		||||
      matrix:
 | 
			
		||||
        python-version: ["3.8"]
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    env:
 | 
			
		||||
      NLTK_DATA: ${{ github.workspace }}/nltk_data
 | 
			
		||||
    needs: [setup, lint]
 | 
			
		||||
    steps:
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
    - name: Set up Python ${{ matrix.python-version }}
 | 
			
		||||
      uses: actions/setup-python@v4
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: ${{ matrix.python-version }}
 | 
			
		||||
    - uses: actions/cache/restore@v3
 | 
			
		||||
      id: virtualenv-cache
 | 
			
		||||
      with:
 | 
			
		||||
        path: |
 | 
			
		||||
          nltk_data
 | 
			
		||||
          .venv-base
 | 
			
		||||
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
 | 
			
		||||
    - name: Setup virtual environment
 | 
			
		||||
      if: steps.virtualenv-cache.outputs.cache-hit != 'true'
 | 
			
		||||
      run: |
 | 
			
		||||
        python${{ matrix.python-version}} -m venv .venv-base
 | 
			
		||||
        source .venv-base/bin/activate
 | 
			
		||||
        mkdir "$NLTK_DATA"
 | 
			
		||||
        make install-base-ci
 | 
			
		||||
    - name: Test
 | 
			
		||||
      env:
 | 
			
		||||
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
			
		||||
      run: |
 | 
			
		||||
        source .venv-base/bin/activate
 | 
			
		||||
        make test-no-extras CI=true
 | 
			
		||||
 | 
			
		||||
  test_unit_dependency_extras:
 | 
			
		||||
    # NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix)
 | 
			
		||||
    strategy:
 | 
			
		||||
      matrix:
 | 
			
		||||
        python-version: ["3.8"]
 | 
			
		||||
        extra: ["csv", "docx", "odt", "markdown", "pypandoc", "msg", "pdf-image", "pptx", "xlsx"]
 | 
			
		||||
    runs-on: ubuntu-latest
 | 
			
		||||
    env:
 | 
			
		||||
      NLTK_DATA: ${{ github.workspace }}/nltk_data
 | 
			
		||||
    needs: [setup, lint, test_unit_no_extras]
 | 
			
		||||
    steps:
 | 
			
		||||
    - uses: actions/checkout@v3
 | 
			
		||||
    - name: Set up Python ${{ matrix.python-version }}
 | 
			
		||||
      uses: actions/setup-python@v4
 | 
			
		||||
      with:
 | 
			
		||||
        python-version: ${{ matrix.python-version }}
 | 
			
		||||
    - uses: actions/cache/restore@v3
 | 
			
		||||
      id: virtualenv-cache
 | 
			
		||||
      with:
 | 
			
		||||
        path: |
 | 
			
		||||
          nltk_data
 | 
			
		||||
        key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
 | 
			
		||||
    - name: Setup virtual environment
 | 
			
		||||
      run: |
 | 
			
		||||
        python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }}
 | 
			
		||||
        source .venv-${{ matrix.extra }}/bin/activate
 | 
			
		||||
        make install-base-ci
 | 
			
		||||
        make install-${{ matrix.extra }}
 | 
			
		||||
    - name: Test
 | 
			
		||||
      env:
 | 
			
		||||
        UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
 | 
			
		||||
      run: |
 | 
			
		||||
        source .venv-${{ matrix.extra }}/bin/activate
 | 
			
		||||
        # NOTE(newelh) - determine what needs to be installed here
 | 
			
		||||
        sudo apt-get update
 | 
			
		||||
        sudo apt-get install -y libmagic-dev poppler-utils libreoffice
 | 
			
		||||
        make install-pandoc
 | 
			
		||||
        sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
 | 
			
		||||
        sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
 | 
			
		||||
        tesseract --version
 | 
			
		||||
        # NOTE(robinson) - Installing weaviate-client separately here because the requests
 | 
			
		||||
        # version conflicts with label_studio_sdk
 | 
			
		||||
        pip install weaviate-client
 | 
			
		||||
        pip install argilla
 | 
			
		||||
        make test-extra-${{ matrix.extra }} CI=true
 | 
			
		||||
 | 
			
		||||
  test_ingest:
 | 
			
		||||
    strategy:
 | 
			
		||||
      matrix:
 | 
			
		||||
 | 
			
		||||
@ -1,6 +1,11 @@
 | 
			
		||||
## 0.10.5-dev0
 | 
			
		||||
 | 
			
		||||
### Enhancements
 | 
			
		||||
* Create new CI Pipelines
 | 
			
		||||
  - Checking text, xml, email, and html doc tests against the library installed without extras
 | 
			
		||||
  - Checking each library extra against its respective tests
 | 
			
		||||
 | 
			
		||||
## 0.10.3
 | 
			
		||||
* Adds ability to reuse connections per process in unstructured-ingest
 | 
			
		||||
* Pass ocr_mode in partition_pdf and set the default back to individual pages for now
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
							
								
								
									
										62
									
								
								Makefile
									
									
									
									
									
								
							
							
						
						
									
										62
									
								
								Makefile
									
									
									
									
									
								
							@ -23,6 +23,9 @@ install: install-base-pip-packages install-dev install-nltk-models install-test
 | 
			
		||||
.PHONY: install-ci
 | 
			
		||||
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
 | 
			
		||||
 | 
			
		||||
.PHONY: install-base-ci
 | 
			
		||||
install-base-ci: install-base-pip-packages install-nltk-models install-test
 | 
			
		||||
 | 
			
		||||
.PHONY: install-base-pip-packages
 | 
			
		||||
install-base-pip-packages:
 | 
			
		||||
	python3 -m pip install pip==${PIP_VERSION}
 | 
			
		||||
@ -253,6 +256,65 @@ test:
 | 
			
		||||
test-unstructured-api-unit:
 | 
			
		||||
	scripts/test-unstructured-api-unit.sh
 | 
			
		||||
 | 
			
		||||
.PHONY: test-no-extras
 | 
			
		||||
# TODO(newelh) Add json test when fixed
 | 
			
		||||
test-no-extras:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/test_text.py \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/test_email.py \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/test_html_partition.py \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/test_xml_partition.py 
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-csv
 | 
			
		||||
test-extra-csv:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/csv
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-docx
 | 
			
		||||
test-extra-docx:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/docx
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-markdown
 | 
			
		||||
test-extra-markdown:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/markdown
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-msg
 | 
			
		||||
test-extra-msg:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/msg
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-odt
 | 
			
		||||
test-extra-odt:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/odt
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-pdf-image
 | 
			
		||||
test-extra-pdf-image:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/pdf-image
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-pptx
 | 
			
		||||
test-extra-pptx:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/pptx
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-epub
# Runs pytest on the epub partition tests (test_${PACKAGE_NAME}/partition/epub)
# with the repository root on PYTHONPATH and the caller's CI value passed through.
# The rule name must match the .PHONY declaration above: it was previously
# spelled `test-extra-pypandoc`, which is defined again further down, so make
# overrode this recipe and `make test-extra-epub` had no rule to run.
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest \
		test_${PACKAGE_NAME}/partition/epub
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-pypandoc
 | 
			
		||||
test-extra-pypandoc:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/pypandoc
 | 
			
		||||
 | 
			
		||||
.PHONY: test-extra-xlsx
 | 
			
		||||
test-extra-xlsx:
 | 
			
		||||
	PYTHONPATH=. CI=$(CI) pytest \
 | 
			
		||||
		test_${PACKAGE_NAME}/partition/xlsx
 | 
			
		||||
 | 
			
		||||
## check:                   runs linters (includes tests)
 | 
			
		||||
.PHONY: check
 | 
			
		||||
check: check-src check-tests check-version
 | 
			
		||||
 | 
			
		||||
@ -19,7 +19,7 @@ expected_sections = {
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    elements = partition_epub(filename=filename)
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
    assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
 | 
			
		||||
@ -32,7 +32,7 @@ def test_partition_epub_from_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_filename_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    elements = partition_epub(filename=filename, metadata_filename="test")
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
    assert all(element.metadata.filename == "test" for element in elements)
 | 
			
		||||
@ -40,7 +40,7 @@ def test_partition_epub_from_filename_with_metadata_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_file():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_epub(file=f)
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -53,7 +53,7 @@ def test_partition_epub_from_file():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_file_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_epub(file=f, metadata_filename="test")
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -62,7 +62,7 @@ def test_partition_epub_from_file_with_metadata_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_filename_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    elements = partition_epub(filename=filename, include_metadata=False)
 | 
			
		||||
    assert elements[0].metadata.filetype is None
 | 
			
		||||
    assert elements[0].metadata.page_name is None
 | 
			
		||||
@ -71,7 +71,7 @@ def test_partition_epub_from_filename_exclude_metadata():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_epub_from_file_exlcude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_epub(file=f, include_metadata=False)
 | 
			
		||||
    assert elements[0].metadata.filetype is None
 | 
			
		||||
@ -11,7 +11,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    elements = partition_md(filename=filename)
 | 
			
		||||
    assert "PageBreak" not in [elem.category for elem in elements]
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -20,7 +20,7 @@ def test_partition_md_from_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_filename_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    elements = partition_md(filename=filename, metadata_filename="test")
 | 
			
		||||
    assert "PageBreak" not in [elem.category for elem in elements]
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -29,7 +29,7 @@ def test_partition_md_from_filename_with_metadata_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_file():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        elements = partition_md(file=f)
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -38,7 +38,7 @@ def test_partition_md_from_file():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_file_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        elements = partition_md(file=f, metadata_filename="test")
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -46,7 +46,7 @@ def test_partition_md_from_file_with_metadata_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_text():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
    elements = partition_md(text=text)
 | 
			
		||||
@ -64,7 +64,7 @@ class MockResponse:
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_url():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
 | 
			
		||||
@ -82,7 +82,7 @@ def test_partition_md_from_url():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_url_raises_with_bad_status_code():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
 | 
			
		||||
@ -97,7 +97,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_url_raises_with_bad_content_type():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
 | 
			
		||||
@ -117,7 +117,7 @@ def test_partition_md_raises_with_none_specified():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_raises_with_too_many_specified():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
 | 
			
		||||
@ -126,14 +126,14 @@ def test_partition_md_raises_with_too_many_specified():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_filename_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    elements = partition_md(filename=filename, include_metadata=False)
 | 
			
		||||
    for i in range(len(elements)):
 | 
			
		||||
        assert elements[i].metadata.to_dict() == {}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_file_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        elements = partition_md(file=f, include_metadata=False)
 | 
			
		||||
    for i in range(len(elements)):
 | 
			
		||||
@ -141,7 +141,7 @@ def test_partition_md_from_file_exclude_metadata():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_md_from_text_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
 | 
			
		||||
    with open(filename) as f:
 | 
			
		||||
        text = f.read()
 | 
			
		||||
    elements = partition_md(text=text, include_metadata=False)
 | 
			
		||||
@ -14,7 +14,7 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
 | 
			
		||||
from unstructured.partition.text import partition_text
 | 
			
		||||
 | 
			
		||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
 | 
			
		||||
 | 
			
		||||
EXPECTED_MSG_OUTPUT = [
 | 
			
		||||
    NarrativeText(text="This is a test email to use for unit tests."),
 | 
			
		||||
@ -104,6 +104,7 @@ def test_extract_attachment_info():
 | 
			
		||||
        DIRECTORY,
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "example-docs",
 | 
			
		||||
        "fake-email-attachment.msg",
 | 
			
		||||
    )
 | 
			
		||||
@ -5,7 +5,7 @@ from unstructured.documents.elements import Title
 | 
			
		||||
from unstructured.partition.odt import partition_odt
 | 
			
		||||
 | 
			
		||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_odt_from_filename():
 | 
			
		||||
@ -177,6 +177,7 @@ def test_partition_image_with_ocr_detects_korean():
 | 
			
		||||
        DIRECTORY,
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "example-docs",
 | 
			
		||||
        "english-and-korean.png",
 | 
			
		||||
    )
 | 
			
		||||
@ -191,7 +192,7 @@ def test_partition_image_with_ocr_detects_korean():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_image_with_ocr_detects_korean_from_file():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = image.partition_image(
 | 
			
		||||
            file=f,
 | 
			
		||||
@ -208,6 +209,7 @@ def test_partition_image_raises_with_bad_strategy():
 | 
			
		||||
        DIRECTORY,
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "example-docs",
 | 
			
		||||
        "english-and-korean.png",
 | 
			
		||||
    )
 | 
			
		||||
@ -216,7 +218,14 @@ def test_partition_image_raises_with_bad_strategy():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_image_default_strategy_hi_res():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
 | 
			
		||||
    filename = os.path.join(
 | 
			
		||||
        DIRECTORY,
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "..",
 | 
			
		||||
        "example-docs",
 | 
			
		||||
        "layout-parser-paper-fast.jpg",
 | 
			
		||||
    )
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = image.partition_image(file=f)
 | 
			
		||||
 | 
			
		||||
@ -7,7 +7,7 @@ from unstructured.documents.elements import ListItem, NarrativeText, Title
 | 
			
		||||
from unstructured.partition.ppt import partition_ppt
 | 
			
		||||
 | 
			
		||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
 | 
			
		||||
 | 
			
		||||
EXPECTED_PPT_OUTPUT = [
 | 
			
		||||
    Title(text="Adding a Bullet Slide"),
 | 
			
		||||
@ -14,7 +14,7 @@ from unstructured.documents.elements import (
 | 
			
		||||
from unstructured.partition.pptx import partition_pptx
 | 
			
		||||
 | 
			
		||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
 | 
			
		||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
 | 
			
		||||
 | 
			
		||||
EXPECTED_PPTX_OUTPUT = [
 | 
			
		||||
    Title(text="Adding a Bullet Slide"),
 | 
			
		||||
@ -8,7 +8,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    elements = partition_rtf(filename=filename)
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
    assert elements[0] == Title("My First Heading")
 | 
			
		||||
@ -17,14 +17,14 @@ def test_partition_rtf_from_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_filename_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    elements = partition_rtf(filename=filename, metadata_filename="test")
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
    assert all(element.metadata.filename == "test" for element in elements)
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_file():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_rtf(file=f)
 | 
			
		||||
    assert len(elements) > 0
 | 
			
		||||
@ -34,7 +34,7 @@ def test_partition_rtf_from_file():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_file_with_metadata_filename():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_rtf(file=f, metadata_filename="test")
 | 
			
		||||
    assert elements[0] == Title("My First Heading")
 | 
			
		||||
@ -43,14 +43,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_filename_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    elements = partition_rtf(filename=filename, include_metadata=False)
 | 
			
		||||
    for i in range(len(elements)):
 | 
			
		||||
        assert elements[i].metadata.to_dict() == {}
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def test_partition_rtf_from_file_exclude_metadata():
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
 | 
			
		||||
    with open(filename, "rb") as f:
 | 
			
		||||
        elements = partition_rtf(file=f, include_metadata=False)
 | 
			
		||||
    for i in range(len(elements)):
 | 
			
		||||
@ -25,8 +25,8 @@ from unstructured.logger import logger
 | 
			
		||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
 | 
			
		||||
from unstructured.utils import dependency_exists
 | 
			
		||||
 | 
			
		||||
if dependency_exists("docx"):
 | 
			
		||||
    import docx.table as docxtable
 | 
			
		||||
if dependency_exists("docx") and dependency_exists("docx.table"):
 | 
			
		||||
    from docx.table import Table as docxtable
 | 
			
		||||
 | 
			
		||||
if TYPE_CHECKING:
 | 
			
		||||
    from unstructured_inference.inference.layoutelement import (
 | 
			
		||||
 | 
			
		||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user