mirror of
https://github.com/Unstructured-IO/unstructured.git
synced 2025-12-27 15:13:35 +00:00
fix: parameterized ingest test overwriting (#838)
* sets OVERWRITE_FIXTURES to default to false in test-ingest-local-single-file.sh
* fixes incorrect expected results
* update expected results to properly parse Korean text
* bonus: installs language pack for Korean in CI and ingest fixture workflows
This commit is contained in:
parent
60fe231f08
commit
8ea5f6939e
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -194,6 +194,7 @@ jobs:
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make install-ingest-s3
|
||||
make install-ingest-azure
|
||||
|
||||
@ -70,6 +70,7 @@ jobs:
|
||||
sudo apt-get install -y libmagic-dev poppler-utils libreoffice pandoc
|
||||
sudo add-apt-repository -y ppa:alex-p/tesseract-ocr5
|
||||
sudo apt-get install -y tesseract-ocr
|
||||
sudo apt-get install -y tesseract-ocr-kor
|
||||
tesseract --version
|
||||
make install-ingest-s3
|
||||
make install-ingest-azure
|
||||
|
||||
@ -27,13 +27,13 @@
|
||||
"text": "Note: Remember to write your own \"OPENING MESSAGE\" before you copy and paste the template. please always include [TREASURE HARUTO] for example:"
|
||||
},
|
||||
{
|
||||
"type": "UncategorizedText",
|
||||
"element_id": "23a37aa2d5f39d5e2275dec011be76be",
|
||||
"type": "NarrativeText",
|
||||
"element_id": "9854d0df4dceac0f09846f4f272f0703",
|
||||
"metadata": {
|
||||
"data_source": {},
|
||||
"filetype": "image/png"
|
||||
},
|
||||
"text": "StS ofAl2, AS|E YGEAS 1B TREASUREMH HARUTOM| 2] BHEYLICH, BHO AY, HARUTO M BE = WSO Hol Wat SSGstsS LRU, O| Wil BS SH ASP ASS AZopO} HAS] TAI St a Bat FSAel GAS HS + U7/S HLICh."
|
||||
"text": "안녕하세요, 저 희 는 YGEAS 그룹 TREASUREMH HARUTOM| 2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 메 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 고 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다."
|
||||
},
|
||||
{
|
||||
"type": "NarrativeText",
|
||||
|
||||
@ -10,11 +10,14 @@ PYTHONPATH=. ./unstructured/ingest/main.py \
|
||||
--local-input-path example-docs/english-and-korean.png \
|
||||
--structured-output-dir parameterized-ingest-output \
|
||||
--partition-ocr-languages eng+kor \
|
||||
--partition-strategy ocr_only \
|
||||
--verbose \
|
||||
--reprocess
|
||||
|
||||
set +e
|
||||
|
||||
OVERWRITE_FIXTURES=${OVERWRITE_FIXTURES:-false}
|
||||
|
||||
# to update ingest test fixtures, run scripts/ingest-test-fixtures-update.sh on x86_64
|
||||
if [[ "$OVERWRITE_FIXTURES" != "false" ]]; then
|
||||
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user