mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Chore: Support epub tests in docker image (#630)
* docker works * more epub tests * changelog version * support epub + odt + rtf * update dockerfile * revert.. * install pandoc on ci env * pandoc docker grab based on arch * move arch into image * move back to base image
This commit is contained in:
parent
c5d9469001
commit
fc59a043b7
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -134,7 +134,8 @@ jobs:
|
|||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
make install-detectron2
|
make install-detectron2
|
||||||
sudo apt-get update
|
sudo apt-get update
|
||||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||||
|
make install-pandoc
|
||||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||||
tesseract --version
|
tesseract --version
|
||||||
@ -228,4 +229,3 @@ jobs:
|
|||||||
source .venv/bin/activate
|
source .venv/bin/activate
|
||||||
make docker-build
|
make docker-build
|
||||||
make docker-test
|
make docker-test
|
||||||
|
|
||||||
|
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,15 @@
|
|||||||
|
## 0.6.11-dev0
|
||||||
|
|
||||||
|
### Enhancements
|
||||||
|
|
||||||
|
* Supports epub tests since pandoc is updated in base image
|
||||||
|
|
||||||
|
### Features
|
||||||
|
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
|
||||||
## 0.6.10
|
## 0.6.10
|
||||||
|
|
||||||
### Enhancements
|
### Enhancements
|
||||||
|
@ -1,6 +1,6 @@
|
|||||||
# syntax=docker/dockerfile:experimental
|
# syntax=docker/dockerfile:experimental
|
||||||
|
|
||||||
FROM quay.io/unstructured-io/base-images:centos7.9
|
FROM quay.io/unstructured-io/base-images:centos7.9-1
|
||||||
|
|
||||||
ARG PIP_VERSION
|
ARG PIP_VERSION
|
||||||
|
|
||||||
|
5
Makefile
5
Makefile
@ -105,6 +105,11 @@ install-detectron2: install-tensorboard
|
|||||||
.PHONY: install-local-inference
|
.PHONY: install-local-inference
|
||||||
install-local-inference: install install-unstructured-inference install-detectron2
|
install-local-inference: install install-unstructured-inference install-detectron2
|
||||||
|
|
||||||
|
.PHONY: install-pandoc
|
||||||
|
install-pandoc:
|
||||||
|
ARCH=${ARCH} ./scripts/install-pandoc.sh
|
||||||
|
|
||||||
|
|
||||||
## pip-compile: compiles all base/dev/test requirements
|
## pip-compile: compiles all base/dev/test requirements
|
||||||
.PHONY: pip-compile
|
.PHONY: pip-compile
|
||||||
pip-compile:
|
pip-compile:
|
||||||
|
16
scripts/install-pandoc.sh
Executable file
16
scripts/install-pandoc.sh
Executable file
@ -0,0 +1,16 @@
|
|||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Mainly used for installing pandoc on CI
|
||||||
|
set -euo pipefail
|
||||||
|
if [ "${ARCH}" = "x86_64" ]; then
|
||||||
|
export PANDOC_ARCH="amd64"
|
||||||
|
elif [ "${ARCH}" = "arm64" ] || [ "${ARCH}" = "aarch64" ]; then
|
||||||
|
export PANDOC_ARCH="arm64"
|
||||||
|
fi
|
||||||
|
|
||||||
|
wget https://github.com/jgm/pandoc/releases/download/3.1.2/pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz
|
||||||
|
tar xvf pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz
|
||||||
|
cd pandoc-3.1.2
|
||||||
|
sudo cp bin/pandoc /usr/local/bin/
|
||||||
|
cd ..
|
||||||
|
rm -rf pandoc-3.1.2*
|
@ -10,7 +10,6 @@ from unstructured.file_utils.file_conversion import convert_file_to_text
|
|||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
|
|
||||||
def test_convert_file_to_text():
|
def test_convert_file_to_text():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||||
html_text = convert_file_to_text(filename, source_format="epub", target_format="html")
|
html_text = convert_file_to_text(filename, source_format="epub", target_format="html")
|
||||||
|
@ -6,7 +6,6 @@ from importlib import import_module
|
|||||||
from unittest.mock import patch
|
from unittest.mock import patch
|
||||||
|
|
||||||
import docx
|
import docx
|
||||||
import pypandoc
|
|
||||||
import pytest
|
import pytest
|
||||||
|
|
||||||
from unstructured.cleaners.core import clean_extra_whitespace
|
from unstructured.cleaners.core import clean_extra_whitespace
|
||||||
@ -37,8 +36,6 @@ EXPECTED_EMAIL_OUTPUT = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
is_in_docker = os.path.exists("/.dockerenv")
|
is_in_docker = os.path.exists("/.dockerenv")
|
||||||
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
|
|
||||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
|
||||||
|
|
||||||
|
|
||||||
def test_auto_partition_email_from_filename():
|
def test_auto_partition_email_from_filename():
|
||||||
@ -432,7 +429,6 @@ def test_auto_with_page_breaks():
|
|||||||
assert PageBreak() in elements
|
assert PageBreak() in elements
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
def test_auto_partition_epub_from_filename():
|
def test_auto_partition_epub_from_filename():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||||
elements = partition(filename=filename, strategy="hi_res")
|
elements = partition(filename=filename, strategy="hi_res")
|
||||||
@ -440,7 +436,6 @@ def test_auto_partition_epub_from_filename():
|
|||||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
def test_auto_partition_epub_from_file():
|
def test_auto_partition_epub_from_file():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
@ -463,8 +458,6 @@ def test_auto_partition_msg_from_filename():
|
|||||||
assert elements == EXPECTED_MSG_OUTPUT
|
assert elements == EXPECTED_MSG_OUTPUT
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
|
||||||
def test_auto_partition_rtf_from_filename():
|
def test_auto_partition_rtf_from_filename():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
|
||||||
elements = partition(filename=filename, strategy="hi_res")
|
elements = partition(filename=filename, strategy="hi_res")
|
||||||
@ -510,16 +503,12 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
|
|||||||
assert elements[0].text == "News Around NOAA"
|
assert elements[0].text == "News Around NOAA"
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
|
||||||
def test_auto_partition_odt_from_filename():
|
def test_auto_partition_odt_from_filename():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
elements = partition(filename=filename, strategy="hi_res")
|
elements = partition(filename=filename, strategy="hi_res")
|
||||||
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
|
||||||
def test_auto_partition_odt_from_file():
|
def test_auto_partition_odt_from_file():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
@ -591,12 +580,6 @@ FILETYPE_TO_MODULE = {
|
|||||||
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||||
if filetype in (FileType.JPG, FileType.PNG):
|
if filetype in (FileType.JPG, FileType.PNG):
|
||||||
pytest.skip()
|
pytest.skip()
|
||||||
if (filetype is FileType.RTF) and (is_in_docker or rtf_not_supported):
|
|
||||||
pytest.skip()
|
|
||||||
if (filetype is FileType.ODT) and (is_in_docker or odt_not_supported):
|
|
||||||
pytest.skip()
|
|
||||||
if (filetype is FileType.EPUB) and is_in_docker:
|
|
||||||
pytest.skip()
|
|
||||||
extension = filetype.name.lower()
|
extension = filetype.name.lower()
|
||||||
filetype_module = (
|
filetype_module = (
|
||||||
extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
|
extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
|
||||||
|
@ -1,16 +1,11 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from unstructured.partition.epub import partition_epub
|
from unstructured.partition.epub import partition_epub
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
is_in_docker = os.path.exists("/.dockerenv")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
def test_partition_epub_from_filename():
|
def test_partition_epub_from_filename():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||||
elements = partition_epub(filename=filename)
|
elements = partition_epub(filename=filename)
|
||||||
@ -18,7 +13,6 @@ def test_partition_epub_from_filename():
|
|||||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
def test_partition_epub_from_file():
|
def test_partition_epub_from_file():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
|
@ -1,29 +1,19 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
import pypandoc
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from unstructured.documents.elements import Title
|
from unstructured.documents.elements import Title
|
||||||
from unstructured.partition.odt import partition_odt
|
from unstructured.partition.odt import partition_odt
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||||
|
|
||||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
|
||||||
is_in_docker = os.path.exists("/.dockerenv")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
|
||||||
def test_partition_odt_from_filename():
|
def test_partition_odt_from_filename():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
elements = partition_odt(filename=filename)
|
elements = partition_odt(filename=filename)
|
||||||
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
|
||||||
def test_partition_odt_from_file():
|
def test_partition_odt_from_file():
|
||||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
|
@ -1,20 +1,12 @@
|
|||||||
import os
|
import os
|
||||||
import pathlib
|
import pathlib
|
||||||
|
|
||||||
import pypandoc
|
|
||||||
import pytest
|
|
||||||
|
|
||||||
from unstructured.documents.elements import Title
|
from unstructured.documents.elements import Title
|
||||||
from unstructured.partition.rtf import partition_rtf
|
from unstructured.partition.rtf import partition_rtf
|
||||||
|
|
||||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||||
|
|
||||||
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
|
|
||||||
is_in_docker = os.path.exists("/.dockerenv")
|
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
|
||||||
def test_partition_rtf_from_filename():
|
def test_partition_rtf_from_filename():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||||
elements = partition_rtf(filename=filename)
|
elements = partition_rtf(filename=filename)
|
||||||
@ -22,8 +14,6 @@ def test_partition_rtf_from_filename():
|
|||||||
assert elements[0] == Title("My First Heading")
|
assert elements[0] == Title("My First Heading")
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
|
||||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
|
||||||
def test_partition_rtf_from_file():
|
def test_partition_rtf_from_file():
|
||||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||||
with open(filename, "rb") as f:
|
with open(filename, "rb") as f:
|
||||||
|
@ -1 +1 @@
|
|||||||
__version__ = "0.6.10" # pragma: no cover
|
__version__ = "0.6.11-dev0" # pragma: no cover
|
||||||
|
Loading…
x
Reference in New Issue
Block a user