mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
test: create CI pipelines for verifying base and extras pass respective tests (#1137)
**Summary** Closes #747. * Create a CI pipeline that runs the text, xml, email, and html doc tests against the library installed without extras. * Create a CI pipeline that runs each library extra against its respective tests.
This commit is contained in:
parent
69edffb0c0
commit
e4aa7373e2
81
.github/workflows/ci.yml
vendored
81
.github/workflows/ci.yml
vendored
@ -152,6 +152,87 @@ jobs:
|
||||
make test CI=true
|
||||
make check-coverage
|
||||
|
||||
# Runs `make test-no-extras` against an environment built with
# `make install-base-ci`, i.e. the library installed without any extras.
test_unit_no_extras:
  strategy:
    matrix:
      python-version: ["3.8"]
  runs-on: ubuntu-latest
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  needs: [setup, lint]
  steps:
  - uses: actions/checkout@v3
  - name: Set up Python ${{ matrix.python-version }}
    uses: actions/setup-python@v4
    with:
      python-version: ${{ matrix.python-version }}
  # Restore-only cache step: the virtualenv below is rebuilt whenever this
  # does not report an exact key hit.
  - uses: actions/cache/restore@v3
    id: virtualenv-cache
    with:
      path: |
        nltk_data
        .venv-base
      key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
  - name: Setup virtual environment
    if: steps.virtualenv-cache.outputs.cache-hit != 'true'
    run: |
      python${{ matrix.python-version }} -m venv .venv-base
      source .venv-base/bin/activate
      # -p keeps this step from failing if the directory already exists.
      mkdir -p "$NLTK_DATA"
      make install-base-ci
  - name: Test
    env:
      UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
    run: |
      source .venv-base/bin/activate
      make test-no-extras CI=true
|
||||
|
||||
# For each entry in the `extra` matrix: builds a dedicated virtualenv with the
# base install plus `make install-<extra>`, then runs `make test-extra-<extra>`.
test_unit_dependency_extras:
  # NOTE(newelh) - Split extras into separate steps in the same pipeline (avoid using matrix)
  needs: [setup, lint, test_unit_no_extras]
  runs-on: ubuntu-latest
  strategy:
    matrix:
      python-version: ["3.8"]
      extra:
      - "csv"
      - "docx"
      - "odt"
      - "markdown"
      - "pypandoc"
      - "msg"
      - "pdf-image"
      - "pptx"
      - "xlsx"
  env:
    NLTK_DATA: ${{ github.workspace }}/nltk_data
  steps:
  - uses: actions/checkout@v3
  - name: Set up Python ${{ matrix.python-version }}
    uses: actions/setup-python@v4
    with:
      python-version: ${{ matrix.python-version }}
  # Only nltk_data is restored here; the per-extra virtualenv is always
  # created fresh by the next step.
  - id: virtualenv-cache
    uses: actions/cache/restore@v3
    with:
      key: unstructured-${{ runner.os }}-${{ matrix.python-version }}-${{ env.GHA_CACHE_KEY_VERSION }}-${{ hashFiles('requirements/*.txt') }}-base
      path: |
        nltk_data
  - name: Setup virtual environment
    run: |
      python${{ matrix.python-version}} -m venv .venv-${{ matrix.extra }}
      source .venv-${{ matrix.extra }}/bin/activate
      make install-base-ci
      make install-${{ matrix.extra }}
  - name: Test
    env:
      UNS_API_KEY: ${{ secrets.UNS_API_KEY }}
    run: |
      source .venv-${{ matrix.extra }}/bin/activate
      # NOTE(newelh) - determine what needs to be installed here
      sudo apt-get update
      sudo apt-get install -y libmagic-dev poppler-utils libreoffice
      make install-pandoc
      sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
      sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
      tesseract --version
      # NOTE(robinson) - Installing weaviate-client separately here because the requests
      # version conflicts with label_studio_sdk
      pip install weaviate-client
      pip install argilla
      make test-extra-${{ matrix.extra }} CI=true
|
||||
|
||||
test_ingest:
|
||||
strategy:
|
||||
matrix:
|
||||
|
@ -1,6 +1,11 @@
|
||||
## 0.10.5-dev0
|
||||
|
||||
### Enhancements
|
||||
* Create new CI Pipelines
|
||||
- Checking text, xml, email, and html doc tests against the library installed without extras
|
||||
- Checking each library extra against their respective tests
|
||||
|
||||
## 0.10.3
|
||||
* Adds ability to reuse connections per process in unstructured-ingest
|
||||
* Pass ocr_mode in partition_pdf and set the default back to individual pages for now
|
||||
|
||||
|
62
Makefile
62
Makefile
@ -23,6 +23,9 @@ install: install-base-pip-packages install-dev install-nltk-models install-test
|
||||
.PHONY: install-ci
|
||||
install-ci: install-base-pip-packages install-nltk-models install-huggingface install-all-docs install-test
|
||||
|
||||
# Aggregate target with no recipe of its own: runs install-base-pip-packages,
# install-nltk-models and install-test (no extras).
.PHONY: install-base-ci
|
||||
install-base-ci: install-base-pip-packages install-nltk-models install-test
|
||||
|
||||
.PHONY: install-base-pip-packages
|
||||
install-base-pip-packages:
|
||||
python3 -m pip install pip==${PIP_VERSION}
|
||||
@ -253,6 +256,65 @@ test:
|
||||
test-unstructured-api-unit:
|
||||
scripts/test-unstructured-api-unit.sh
|
||||
|
||||
# Runs the text, email, HTML and XML partition test modules with pytest.
# TODO(newelh) Add json test when fixed
.PHONY: test-no-extras
test-no-extras:
	PYTHONPATH=. CI=$(CI) pytest \
		test_$(PACKAGE_NAME)/partition/test_text.py \
		test_$(PACKAGE_NAME)/partition/test_email.py \
		test_$(PACKAGE_NAME)/partition/test_html_partition.py \
		test_$(PACKAGE_NAME)/partition/test_xml_partition.py
|
||||
|
||||
# Runs pytest on the partition/csv test directory.
.PHONY: test-extra-csv
test-extra-csv:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/csv
|
||||
|
||||
# Runs pytest on the partition/docx test directory.
.PHONY: test-extra-docx
test-extra-docx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/docx
|
||||
|
||||
# Runs pytest on the partition/markdown test directory.
.PHONY: test-extra-markdown
test-extra-markdown:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/markdown
|
||||
|
||||
# Runs pytest on the partition/msg test directory.
.PHONY: test-extra-msg
test-extra-msg:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/msg
|
||||
|
||||
# Runs pytest on the partition/odt test directory.
.PHONY: test-extra-odt
test-extra-odt:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/odt
|
||||
|
||||
# Runs pytest on the partition/pdf-image test directory.
.PHONY: test-extra-pdf-image
test-extra-pdf-image:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pdf-image
|
||||
|
||||
# Runs pytest on the partition/pptx test directory.
.PHONY: test-extra-pptx
test-extra-pptx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pptx
|
||||
|
||||
# Runs pytest on the partition/epub test directory.
# The rule name must match the .PHONY declaration; naming this rule
# `test-extra-pypandoc` would leave `make test-extra-epub` undefined and give
# `test-extra-pypandoc` two competing recipes.
.PHONY: test-extra-epub
test-extra-epub:
	PYTHONPATH=. CI=$(CI) pytest \
		test_${PACKAGE_NAME}/partition/epub
|
||||
|
||||
# Runs pytest on the partition/pypandoc test directory.
.PHONY: test-extra-pypandoc
test-extra-pypandoc:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/pypandoc
|
||||
|
||||
# Runs pytest on the partition/xlsx test directory.
.PHONY: test-extra-xlsx
test-extra-xlsx:
	PYTHONPATH=. CI=$(CI) pytest test_$(PACKAGE_NAME)/partition/xlsx
|
||||
|
||||
## check: runs linters (includes tests)
|
||||
.PHONY: check
|
||||
check: check-src check-tests check-version
|
||||
|
@ -19,7 +19,7 @@ expected_sections = {
|
||||
|
||||
|
||||
def test_partition_epub_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
@ -32,7 +32,7 @@ def test_partition_epub_from_filename():
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
@ -40,7 +40,7 @@ def test_partition_epub_from_filename_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_epub_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -53,7 +53,7 @@ def test_partition_epub_from_file():
|
||||
|
||||
|
||||
def test_partition_epub_from_file_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
@ -62,7 +62,7 @@ def test_partition_epub_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_epub_from_filename_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename, include_metadata=False)
|
||||
assert elements[0].metadata.filetype is None
|
||||
assert elements[0].metadata.page_name is None
|
||||
@ -71,7 +71,7 @@ def test_partition_epub_from_filename_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_epub_from_file_exlcude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_epub(file=f, include_metadata=False)
|
||||
assert elements[0].metadata.filetype is None
|
@ -11,7 +11,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def test_partition_md_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
elements = partition_md(filename=filename)
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert len(elements) > 0
|
||||
@ -20,7 +20,7 @@ def test_partition_md_from_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
elements = partition_md(filename=filename, metadata_filename="test")
|
||||
assert "PageBreak" not in [elem.category for elem in elements]
|
||||
assert len(elements) > 0
|
||||
@ -29,7 +29,7 @@ def test_partition_md_from_filename_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -38,7 +38,7 @@ def test_partition_md_from_file():
|
||||
|
||||
|
||||
def test_partition_md_from_file_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
@ -46,7 +46,7 @@ def test_partition_md_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_md_from_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
elements = partition_md(text=text)
|
||||
@ -64,7 +64,7 @@ class MockResponse:
|
||||
|
||||
|
||||
def test_partition_md_from_url():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -82,7 +82,7 @@ def test_partition_md_from_url():
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -97,7 +97,7 @@ def test_partition_md_from_url_raises_with_bad_status_code():
|
||||
|
||||
|
||||
def test_partition_md_from_url_raises_with_bad_content_type():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -117,7 +117,7 @@ def test_partition_md_raises_with_none_specified():
|
||||
|
||||
|
||||
def test_partition_md_raises_with_too_many_specified():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
|
||||
@ -126,14 +126,14 @@ def test_partition_md_raises_with_too_many_specified():
|
||||
|
||||
|
||||
def test_partition_md_from_filename_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
elements = partition_md(filename=filename, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
assert elements[i].metadata.to_dict() == {}
|
||||
|
||||
|
||||
def test_partition_md_from_file_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
elements = partition_md(file=f, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
@ -141,7 +141,7 @@ def test_partition_md_from_file_exclude_metadata():
|
||||
|
||||
|
||||
def test_partition_md_from_text_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "README.md")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "README.md")
|
||||
with open(filename) as f:
|
||||
text = f.read()
|
||||
elements = partition_md(text=text, include_metadata=False)
|
@ -14,7 +14,7 @@ from unstructured.partition.msg import extract_msg_attachment_info, partition_ms
|
||||
from unstructured.partition.text import partition_text
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
EXPECTED_MSG_OUTPUT = [
|
||||
NarrativeText(text="This is a test email to use for unit tests."),
|
||||
@ -104,6 +104,7 @@ def test_extract_attachment_info():
|
||||
DIRECTORY,
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
"example-docs",
|
||||
"fake-email-attachment.msg",
|
||||
)
|
@ -5,7 +5,7 @@ from unstructured.documents.elements import Title
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
|
||||
def test_partition_odt_from_filename():
|
@ -177,6 +177,7 @@ def test_partition_image_with_ocr_detects_korean():
|
||||
DIRECTORY,
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
"example-docs",
|
||||
"english-and-korean.png",
|
||||
)
|
||||
@ -191,7 +192,7 @@ def test_partition_image_with_ocr_detects_korean():
|
||||
|
||||
|
||||
def test_partition_image_with_ocr_detects_korean_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "english-and-korean.png")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png")
|
||||
with open(filename, "rb") as f:
|
||||
elements = image.partition_image(
|
||||
file=f,
|
||||
@ -208,6 +209,7 @@ def test_partition_image_raises_with_bad_strategy():
|
||||
DIRECTORY,
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
"example-docs",
|
||||
"english-and-korean.png",
|
||||
)
|
||||
@ -216,7 +218,14 @@ def test_partition_image_raises_with_bad_strategy():
|
||||
|
||||
|
||||
def test_partition_image_default_strategy_hi_res():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "layout-parser-paper-fast.jpg")
|
||||
filename = os.path.join(
|
||||
DIRECTORY,
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
"example-docs",
|
||||
"layout-parser-paper-fast.jpg",
|
||||
)
|
||||
with open(filename, "rb") as f:
|
||||
elements = image.partition_image(file=f)
|
||||
|
@ -7,7 +7,7 @@ from unstructured.documents.elements import ListItem, NarrativeText, Title
|
||||
from unstructured.partition.ppt import partition_ppt
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
EXPECTED_PPT_OUTPUT = [
|
||||
Title(text="Adding a Bullet Slide"),
|
@ -14,7 +14,7 @@ from unstructured.documents.elements import (
|
||||
from unstructured.partition.pptx import partition_pptx
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "..", "example-docs")
|
||||
|
||||
EXPECTED_PPTX_OUTPUT = [
|
||||
Title(text="Adding a Bullet Slide"),
|
@ -8,7 +8,7 @@ DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename)
|
||||
assert len(elements) > 0
|
||||
assert elements[0] == Title("My First Heading")
|
||||
@ -17,14 +17,14 @@ def test_partition_rtf_from_filename():
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename, metadata_filename="test")
|
||||
assert len(elements) > 0
|
||||
assert all(element.metadata.filename == "test" for element in elements)
|
||||
|
||||
|
||||
def test_partition_rtf_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f)
|
||||
assert len(elements) > 0
|
||||
@ -34,7 +34,7 @@ def test_partition_rtf_from_file():
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_with_metadata_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f, metadata_filename="test")
|
||||
assert elements[0] == Title("My First Heading")
|
||||
@ -43,14 +43,14 @@ def test_partition_rtf_from_file_with_metadata_filename():
|
||||
|
||||
|
||||
def test_partition_rtf_from_filename_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename, include_metadata=False)
|
||||
for i in range(len(elements)):
|
||||
assert elements[i].metadata.to_dict() == {}
|
||||
|
||||
|
||||
def test_partition_rtf_from_file_exclude_metadata():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
elements = partition_rtf(file=f, include_metadata=False)
|
||||
for i in range(len(elements)):
|
@ -25,8 +25,8 @@ from unstructured.logger import logger
|
||||
from unstructured.nlp.patterns import ENUMERATED_BULLETS_RE, UNICODE_BULLETS_RE
|
||||
from unstructured.utils import dependency_exists
|
||||
|
||||
if dependency_exists("docx"):
|
||||
import docx.table as docxtable
|
||||
if dependency_exists("docx") and dependency_exists("docx.table"):
|
||||
from docx.table import Table as docxtable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from unstructured_inference.inference.layoutelement import (
|
||||
|
Loading…
x
Reference in New Issue
Block a user