mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-06-27 02:30:08 +00:00
Chore: Support epub tests in docker image (#630)
* docker works * more epub tests * changelog version * support epub + odt + rtf * update dockerfile * revert.. * install pandoc on ci env * pandoc docker grab based on arch * move arch into image * move back to base image
This commit is contained in:
parent
c5d9469001
commit
fc59a043b7
4
.github/workflows/ci.yml
vendored
4
.github/workflows/ci.yml
vendored
@ -134,7 +134,8 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
make install-detectron2
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice
|
||||
make install-pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr tesseract-ocr-kor
|
||||
tesseract --version
|
||||
@ -228,4 +229,3 @@ jobs:
|
||||
source .venv/bin/activate
|
||||
make docker-build
|
||||
make docker-test
|
||||
|
||||
|
12
CHANGELOG.md
12
CHANGELOG.md
@ -1,3 +1,15 @@
|
||||
## 0.6.11-dev0
|
||||
|
||||
### Enhancements
|
||||
|
||||
* Supports epub tests since pandoc is updated in base image
|
||||
|
||||
### Features
|
||||
|
||||
|
||||
### Fixes
|
||||
|
||||
|
||||
## 0.6.10
|
||||
|
||||
### Enhancements
|
||||
|
@ -1,6 +1,6 @@
|
||||
# syntax=docker/dockerfile:experimental
|
||||
|
||||
FROM quay.io/unstructured-io/base-images:centos7.9
|
||||
FROM quay.io/unstructured-io/base-images:centos7.9-1
|
||||
|
||||
ARG PIP_VERSION
|
||||
|
||||
|
5
Makefile
5
Makefile
@ -105,6 +105,11 @@ install-detectron2: install-tensorboard
|
||||
.PHONY: install-local-inference
|
||||
install-local-inference: install install-unstructured-inference install-detectron2
|
||||
|
||||
.PHONY: install-pandoc
|
||||
install-pandoc:
|
||||
ARCH=${ARCH} ./scripts/install-pandoc.sh
|
||||
|
||||
|
||||
## pip-compile: compiles all base/dev/test requirements
|
||||
.PHONY: pip-compile
|
||||
pip-compile:
|
||||
|
16
scripts/install-pandoc.sh
Executable file
16
scripts/install-pandoc.sh
Executable file
@ -0,0 +1,16 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Mainly used for installing pandoc on CI
|
||||
set -euo pipefail
|
||||
if [ "${ARCH}" = "x86_64" ]; then
|
||||
export PANDOC_ARCH="amd64"
|
||||
elif [ "${ARCH}" = "arm64" ] || [ "${ARCH}" = "aarch64" ]; then
|
||||
export PANDOC_ARCH="arm64"
|
||||
fi
|
||||
|
||||
wget https://github.com/jgm/pandoc/releases/download/3.1.2/pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz
|
||||
tar xvf pandoc-3.1.2-linux-"${PANDOC_ARCH}".tar.gz
|
||||
cd pandoc-3.1.2
|
||||
sudo cp bin/pandoc /usr/local/bin/
|
||||
cd ..
|
||||
rm -rf pandoc-3.1.2*
|
@ -10,7 +10,6 @@ from unstructured.file_utils.file_conversion import convert_file_to_text
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
|
||||
@pytest.mark.xfail(reason="Requirements mismatch, should only fail in docker test")
|
||||
def test_convert_file_to_text():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
html_text = convert_file_to_text(filename, source_format="epub", target_format="html")
|
||||
|
@ -6,7 +6,6 @@ from importlib import import_module
|
||||
from unittest.mock import patch
|
||||
|
||||
import docx
|
||||
import pypandoc
|
||||
import pytest
|
||||
|
||||
from unstructured.cleaners.core import clean_extra_whitespace
|
||||
@ -37,8 +36,6 @@ EXPECTED_EMAIL_OUTPUT = [
|
||||
]
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
|
||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
||||
|
||||
|
||||
def test_auto_partition_email_from_filename():
|
||||
@ -432,7 +429,6 @@ def test_auto_with_page_breaks():
|
||||
assert PageBreak() in elements
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_epub_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition(filename=filename, strategy="hi_res")
|
||||
@ -440,7 +436,6 @@ def test_auto_partition_epub_from_filename():
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_auto_partition_epub_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
@ -463,8 +458,6 @@ def test_auto_partition_msg_from_filename():
|
||||
assert elements == EXPECTED_MSG_OUTPUT
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
||||
def test_auto_partition_rtf_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake-doc.rtf")
|
||||
elements = partition(filename=filename, strategy="hi_res")
|
||||
@ -510,16 +503,12 @@ def test_auto_partition_works_with_unstructured_jsons_from_file():
|
||||
assert elements[0].text == "News Around NOAA"
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
||||
def test_auto_partition_odt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
elements = partition(filename=filename, strategy="hi_res")
|
||||
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
||||
def test_auto_partition_odt_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
with open(filename, "rb") as f:
|
||||
@ -591,12 +580,6 @@ FILETYPE_TO_MODULE = {
|
||||
def test_file_specific_produces_correct_filetype(filetype: FileType):
|
||||
if filetype in (FileType.JPG, FileType.PNG):
|
||||
pytest.skip()
|
||||
if (filetype is FileType.RTF) and (is_in_docker or rtf_not_supported):
|
||||
pytest.skip()
|
||||
if (filetype is FileType.ODT) and (is_in_docker or odt_not_supported):
|
||||
pytest.skip()
|
||||
if (filetype is FileType.EPUB) and is_in_docker:
|
||||
pytest.skip()
|
||||
extension = filetype.name.lower()
|
||||
filetype_module = (
|
||||
extension if filetype not in FILETYPE_TO_MODULE else FILETYPE_TO_MODULE[filetype]
|
||||
|
@ -1,16 +1,11 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
from unstructured.partition.epub import partition_epub
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_partition_epub_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
elements = partition_epub(filename=filename)
|
||||
@ -18,7 +13,6 @@ def test_partition_epub_from_filename():
|
||||
assert elements[0].text.startswith("The Project Gutenberg eBook of Winter Sports")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
def test_partition_epub_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "winter-sports.epub")
|
||||
with open(filename, "rb") as f:
|
||||
|
@ -1,29 +1,19 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pypandoc
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.partition.odt import partition_odt
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
EXAMPLE_DOCS_DIRECTORY = os.path.join(DIRECTORY, "..", "..", "example-docs")
|
||||
|
||||
odt_not_supported = "odt" not in pypandoc.get_pandoc_formats()[0]
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
||||
def test_partition_odt_from_filename():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
elements = partition_odt(filename=filename)
|
||||
assert elements == [Title("Lorem ipsum dolor sit amet.")]
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(odt_not_supported, reason="odt not supported in this version of pypandoc.")
|
||||
def test_partition_odt_from_file():
|
||||
filename = os.path.join(EXAMPLE_DOCS_DIRECTORY, "fake.odt")
|
||||
with open(filename, "rb") as f:
|
||||
|
@ -1,20 +1,12 @@
|
||||
import os
|
||||
import pathlib
|
||||
|
||||
import pypandoc
|
||||
import pytest
|
||||
|
||||
from unstructured.documents.elements import Title
|
||||
from unstructured.partition.rtf import partition_rtf
|
||||
|
||||
DIRECTORY = pathlib.Path(__file__).parent.resolve()
|
||||
|
||||
rtf_not_supported = "rtf" not in pypandoc.get_pandoc_formats()[0]
|
||||
is_in_docker = os.path.exists("/.dockerenv")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
||||
def test_partition_rtf_from_filename():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
elements = partition_rtf(filename=filename)
|
||||
@ -22,8 +14,6 @@ def test_partition_rtf_from_filename():
|
||||
assert elements[0] == Title("My First Heading")
|
||||
|
||||
|
||||
@pytest.mark.skipif(is_in_docker, reason="Skipping this test in Docker container")
|
||||
@pytest.mark.skipif(rtf_not_supported, reason="RTF not supported in this version of pypandoc.")
|
||||
def test_partition_rtf_from_file():
|
||||
filename = os.path.join(DIRECTORY, "..", "..", "example-docs", "fake-doc.rtf")
|
||||
with open(filename, "rb") as f:
|
||||
|
@ -1 +1 @@
|
||||
__version__ = "0.6.10" # pragma: no cover
|
||||
__version__ = "0.6.11-dev0" # pragma: no cover
|
||||
|
Loading…
x
Reference in New Issue
Block a user