Chore: Support epub tests in docker image (#630)

* docker works

* more epub tests

* changelog version

* support epub + odt + rtf

* update dockerfile

* revert..

* install pandoc on ci env

* pandoc docker grab based on arch

* move arch into image

* move back to base image
This commit is contained in:
Yuming Long 2023-05-26 15:38:48 -04:00 committed by GitHub
parent c5d9469001
commit fc59a043b7
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 37 additions and 48 deletions

View File

@ -134,7 +134,8 @@ jobs:
source .venv/bin/activate source .venv/bin/activate
make install-detectron2 make install-detectron2
sudo apt-get update sudo apt-get update
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc sudo apt-get install -y libmagic-dev poppler-utils libreoffice
make install-pandoc
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5 sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
tesseract --version tesseract --version
@ -228,4 +229,3 @@ jobs:
source .venv/bin/activate source .venv/bin/activate
make docker-build make docker-build
make docker-test make docker-test

View File

@ -1,3 +1,15 @@
## 0.6.11-dev0
### Enhancements
* Supports epub tests since pandoc is updated in base image
### Features
### Fixes
## 0.6.10 ## 0.6.10
### Enhancements ### Enhancements

View File

@ -1,6 +1,6 @@
# syntax=docker/dockerfile:experimental # syntax=docker/dockerfile:experimental
FROM quay.io/unstructured-io/base-images:centos7.9 FROM quay.io/unstructured-io/base-images:centos7.9-1
ARG PIP_VERSION ARG PIP_VERSION

View File

@ -105,6 +105,11 @@ install-detectron2: install-tensorboard
.PHONY: install-local-inference .PHONY: install-local-inference
install-local-inference: install install-unstructured-inference install-detectron2 install-local-inference: install install-unstructured-inference install-detectron2
## install-pandoc: runs scripts/install-pandoc.sh, forwarding ARCH to it
.PHONY: install-pandoc
install-pandoc:
ARCH=${ARCH} ./scripts/install-pandoc.sh
## pip-compile: compiles all base/dev/test requirements ## pip-compile: compiles all base/dev/test requirements
.PHONY: pip-compile .PHONY: pip-compile
pip-compile: pip-compile:

16
scripts/install-pandoc.sh Executable file
View File

@ -0,0 +1,16 @@
#!/bin/bash
# Mainly used for installing pandoc on CI.
#
# Downloads the pandoc release tarball matching the machine architecture from
# GitHub and copies the pandoc binary into /usr/local/bin (via sudo).
#
# Environment:
#   ARCH  Machine architecture: x86_64, arm64 or aarch64. Falls back to
#         `uname -m` when unset or empty.
set -euo pipefail

PANDOC_VERSION="3.1.2"

# Callers may pass ARCH through as an empty string, so treat empty the same
# as unset rather than aborting on an unbound variable.
ARCH="${ARCH:-$(uname -m)}"

case "${ARCH}" in
  x86_64)
    export PANDOC_ARCH="amd64"
    ;;
  arm64 | aarch64)
    export PANDOC_ARCH="arm64"
    ;;
  *)
    # Fail with a clear message instead of an "unbound variable" error later.
    echo "Unsupported architecture for pandoc install: '${ARCH}'" >&2
    exit 1
    ;;
esac

TARBALL="pandoc-${PANDOC_VERSION}-linux-${PANDOC_ARCH}.tar.gz"

# Work in a scratch directory so a failed run leaves nothing behind and the
# cleanup never touches unrelated files in the caller's working directory.
WORKDIR="$(mktemp -d)"
trap 'rm -rf "${WORKDIR}"' EXIT
cd "${WORKDIR}"

wget "https://github.com/jgm/pandoc/releases/download/${PANDOC_VERSION}/${TARBALL}"
tar xvf "${TARBALL}"
sudo cp "pandoc-${PANDOC_VERSION}/bin/pandoc" /usr/local/bin/

View File

@ -10,7 +10,6 @@ from unstructured.file_utils.file_conversion import convert_file_to_text
DIRECTORY = pathlib.Path(__file__).parent.resolve() DIRECTORY = pathlib.Path(__file__).parent.resolve()
@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
def test_convert_file_to_text(): def test_convert_file_to_text():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
html_text = convert_file_to_text(filename, source_format="epub", target_format="html") html_text = convert_file_to_text(filename, source_format="epub", target_format="html")

View File

@ -6,7 +6,6 @@ from importlib import import_module
from unittest.mock import patch from unittest.mock import patch
import docx import docx
import pypandoc
import pytest import pytest
from unstructured.cleaners.core import clean_extra_whitespace from unstructured.cleaners.core import clean_extra_whitespace
@ -37,8 +36,6 @@ EXPECTED_EMAIL_OUTPUT = [
] ]
is_in_docker = os.path.exists("/.dockerenv") is_in_docker = os.path.exists("/.dockerenv")
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
def test_auto_partition_email_from_filename(): def test_auto_partition_email_from_filename():
@ -432,7 +429,6 @@ def test_auto_with_page_breaks():
assert PageBreak() in elements assert PageBreak() in elements
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_epub_from_filename(): def test_auto_partition_epub_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
elements = partition(filename=filename, strategy="hi_res") elements = partition(filename=filename, strategy="hi_res")
@ -440,7 +436,6 @@ def test_auto_partition_epub_from_filename():
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_auto_partition_epub_from_file(): def test_auto_partition_epub_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
with open(filename, "rb") as f: with open(filename, "rb") as f:
@ -463,8 +458,6 @@ def test_auto_partition_msg_from_filename():
assert elements == EXPECTED_MSG_OUTPUT assert elements == EXPECTED_MSG_OUTPUT
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
def test_auto_partition_rtf_from_filename(): def test_auto_partition_rtf_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
elements = partition(filename=filename, strategy="hi_res") elements = partition(filename=filename, strategy="hi_res")
@ -510,16 +503,12 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
assert elements[0].text == "News Around NOAA" assert elements[0].text == "News Around NOAA"
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_filename(): def test_auto_partition_odt_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
elements = partition(filename=filename, strategy="hi_res") elements = partition(filename=filename, strategy="hi_res")
assert elements == [Title("Lorem ipsum dolor sit amet.")] assert elements == [Title("Lorem ipsum dolor sit amet.")]
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_auto_partition_odt_from_file(): def test_auto_partition_odt_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
with open(filename, "rb") as f: with open(filename, "rb") as f:
@ -591,12 +580,6 @@ FILETYPE_TO_MODULE = {
def test_file_specific_produces_correct_filetype(filetype: FileType): def test_file_specific_produces_correct_filetype(filetype: FileType):
if filetype in (FileType.JPG, FileType.PNG): if filetype in (FileType.JPG, FileType.PNG):
pytest.skip() pytest.skip()
if (filetype is FileType.RTF) and (is_in_docker or rtf_not_supported):
pytest.skip()
if (filetype is FileType.ODT) and (is_in_docker or odt_not_supported):
pytest.skip()
if (filetype is FileType.EPUB) and is_in_docker:
pytest.skip()
extension = filetype.name.lower() extension = filetype.name.lower()
filetype_module = ( filetype_module = (
extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype] extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]

View File

@ -1,16 +1,11 @@
import os import os
import pathlib import pathlib
import pytest
from unstructured.partition.epub import partition_epub from unstructured.partition.epub import partition_epub
DIRECTORY = pathlib.Path(__file__).parent.resolve() DIRECTORY = pathlib.Path(__file__).parent.resolve()
is_in_docker = os.path.exists("/.dockerenv")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_epub_from_filename(): def test_partition_epub_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
elements = partition_epub(filename=filename) elements = partition_epub(filename=filename)
@ -18,7 +13,6 @@ def test_partition_epub_from_filename():
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports") assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
def test_partition_epub_from_file(): def test_partition_epub_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
with open(filename, "rb") as f: with open(filename, "rb") as f:

View File

@ -1,29 +1,19 @@
import os import os
import pathlib import pathlib
import pypandoc
import pytest
from unstructured.documents.elements import Title from unstructured.documents.elements import Title
from unstructured.partition.odt import partition_odt from unstructured.partition.odt import partition_odt
DIRECTORY = pathlib.Path(__file__).parent.resolve() DIRECTORY = pathlib.Path(__file__).parent.resolve()
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs") EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
is_in_docker = os.path.exists("/.dockerenv")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_filename(): def test_partition_odt_from_filename():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
elements = partition_odt(filename=filename) elements = partition_odt(filename=filename)
assert elements == [Title("Lorem ipsum dolor sit amet.")] assert elements == [Title("Lorem ipsum dolor sit amet.")]
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
def test_partition_odt_from_file(): def test_partition_odt_from_file():
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt") filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
with open(filename, "rb") as f: with open(filename, "rb") as f:

View File

@ -1,20 +1,12 @@
import os import os
import pathlib import pathlib
import pypandoc
import pytest
from unstructured.documents.elements import Title from unstructured.documents.elements import Title
from unstructured.partition.rtf import partition_rtf from unstructured.partition.rtf import partition_rtf
DIRECTORY = pathlib.Path(__file__).parent.resolve() DIRECTORY = pathlib.Path(__file__).parent.resolve()
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
is_in_docker = os.path.exists("/.dockerenv")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
def test_partition_rtf_from_filename(): def test_partition_rtf_from_filename():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
elements = partition_rtf(filename=filename) elements = partition_rtf(filename=filename)
@ -22,8 +14,6 @@ def test_partition_rtf_from_filename():
assert elements[0] == Title("My First Heading") assert elements[0] == Title("My First Heading")
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
def test_partition_rtf_from_file(): def test_partition_rtf_from_file():
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf") filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
with open(filename, "rb") as f: with open(filename, "rb") as f:

View File

@ -1 +1 @@
__version__ = "0.6.10" # pragma: no cover __version__ = "0.6.11-dev0" # pragma: no cover