from __future__ import annotations

import os
import pathlib
import tempfile
from unittest import mock

import pytest
from PIL import Image
from pytesseract import TesseractError
from unstructured_inference.inference import layout

from test_unstructured.partition.pdf_image.test_pdf import assert_element_extraction
from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.elements import ElementType
from unstructured.partition import image, pdf
from unstructured.partition.pdf_image import ocr
from unstructured.partition.utils.constants import (
    UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
    PartitionStrategy,
)
from unstructured.utils import only

DIRECTORY = pathlib.Path(__file__).parent.resolve()


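# A minimal stand-in for an HTTP response: it records a status code and echoes
# a canned JSON payload, which is all the request mocks below need.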
class MockResponse:
    def __init__(self, status_code, response):
        self.status_code = status_code
        self.response = response

    def json(self):
        return self.response


def mock_healthy_get(url, **kwargs):
    return MockResponse(status_code=200, response={})


def mock_unhealthy_get(url, **kwargs):
    return MockResponse(status_code=500, response={})


def mock_unsuccessful_post(url, **kwargs):
    return MockResponse(status_code=500, response={})


def mock_successful_post(url, **kwargs):
    response = {
        "pages": [
            {
                "number": 0,
                "elements": [
                    {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
                ],
            },
            {
                "number": 1,
                "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
            },
        ],
    }
    return MockResponse(status_code=200, response=response)


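# MockPageLayout and MockDocumentLayout fake the unstructured-inference layout
# objects so partitioning logic can be exercised without running a real
# detection model.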
class MockPageLayout(layout.PageLayout):
    def __init__(self, number: int, image: Image.Image):
        self.number = number
        self.image = image
        self.elements = [
            layout.LayoutElement.from_coords(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]


class MockDocumentLayout(layout.DocumentLayout):
    @property
    def pages(self):
        return [
            MockPageLayout(number=0, image=Image.new("1", (1, 1))),
        ]


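# Exercises both input modes accepted by the local partitioner: a filename on
# disk and an in-memory bytes payload.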
@pytest.mark.parametrize(
    ("filename", "file"),
    [
        ("example-docs/example.jpg", None),
        (None, b"0000"),
    ],
)
def test_partition_image_local(monkeypatch, filename, file):
    monkeypatch.setattr(
        layout,
        "process_data_with_model",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        layout,
        "process_file_with_model",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        ocr,
        "process_data_with_ocr",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        ocr,
        "process_file_with_ocr",
        lambda *args, **kwargs: MockDocumentLayout(),
    )

    partition_image_response = pdf._partition_pdf_or_image_local(
        filename,
        file,
        is_image=True,
    )
    assert partition_image_response[0].text == "Charlie Brown and the Great Pumpkin"


@pytest.mark.skip("Needs to be fixed upstream in unstructured-inference")
def test_partition_image_local_raises_with_no_filename():
    with pytest.raises(FileNotFoundError):
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=True)


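# With the auto strategy an image should be routed to layout detection, so a
# detection confidence score is expected alongside the recognized title.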
def test_partition_image_with_auto_strategy(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
    titles = [
        el for el in elements if el.category == ElementType.TITLE and len(el.text.split(" ")) > 10
    ]
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 3
    assert titles[0].text == title
    assert elements[idx].metadata.detection_class_prob is not None
    assert isinstance(elements[idx].metadata.detection_class_prob, float)


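# text_as_html is only populated on Table elements, and only when
# infer_table_structure=True is passed.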
def test_partition_image_with_table_extraction(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    elements = image.partition_image(
        filename=filename,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 1
    assert "<table><thead><tr>" in table[0]
    assert "</thead><tbody><tr>" in table[0]


def test_partition_image_with_multipage_tiff(
    filename="example-docs/layout-parser-paper-combined.tiff",
):
    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
    assert elements[-1].metadata.page_number == 2


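# Round-trips the JPEG fixture through BMP to confirm partitioning is not tied
# to a single raster format.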
def test_partition_image_with_bmp(
    tmpdir,
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    bmp_filename = os.path.join(tmpdir.dirname, "example.bmp")
    img = Image.open(filename)
    img.save(bmp_filename)

    elements = image.partition_image(
        filename=bmp_filename,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 1
    assert "<table><thead><tr>" in table[0]
    assert "</thead><tbody><tr>" in table[0]


def test_partition_image_with_language_passed(filename="example-docs/example.jpg"):
    with mock.patch.object(
        ocr,
        "process_file_with_ocr",
        mock.MagicMock(),
    ) as mock_partition:
        image.partition_image(
            filename=filename,
            strategy=PartitionStrategy.HI_RES,
            ocr_languages="eng+swe",
        )

    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"


def test_partition_image_from_file_with_language_passed(
    filename="example-docs/example.jpg",
):
    with mock.patch.object(
        ocr,
        "process_data_with_ocr",
        mock.MagicMock(),
    ) as mock_partition, open(filename, "rb") as f:
        image.partition_image(file=f, strategy=PartitionStrategy.HI_RES, ocr_languages="eng+swe")

    assert mock_partition.call_args.kwargs.get("ocr_languages") == "eng+swe"


# NOTE(crag): see https://github.com/Unstructured-IO/unstructured/issues/1086
@pytest.mark.skip(reason="Currently catching too many tesseract errors")
def test_partition_image_raises_with_invalid_language(
    filename="example-docs/example.jpg",
):
    with pytest.raises(TesseractError):
        image.partition_image(
            filename=filename,
            strategy=PartitionStrategy.HI_RES,
            ocr_languages="fakeroo",
        )


@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_image_strategies_keep_languages_metadata(strategy):
    filename = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "..",
        "example-docs",
        "english-and-korean.png",
    )
    elements = image.partition_image(
        filename=filename,
        languages=["eng", "kor"],
        strategy=strategy,
    )

    assert elements[0].metadata.languages == ["eng", "kor"]


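# OCR-only partitioning with ocr_languages="eng+kor" should pick up both the
# English heading and the Korean body text from the mixed-language fixture.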
def test_partition_image_with_ocr_detects_korean():
    filename = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "..",
        "example-docs",
        "english-and-korean.png",
    )
    elements = image.partition_image(
        filename=filename,
        ocr_languages="eng+kor",
        strategy=PartitionStrategy.OCR_ONLY,
    )

    assert elements[0].text == "RULES AND INSTRUCTIONS"
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")


def test_partition_image_with_ocr_detects_korean_from_file():
    filename = os.path.join(DIRECTORY, "..", "..", "..", "example-docs", "english-and-korean.png")
    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            ocr_languages="eng+kor",
            strategy=PartitionStrategy.OCR_ONLY,
        )

    assert elements[0].text == "RULES AND INSTRUCTIONS"
    assert elements[3].text.replace(" ", "").startswith("안녕하세요")


def test_partition_image_raises_with_bad_strategy():
    filename = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "..",
        "example-docs",
        "english-and-korean.png",
    )
    with pytest.raises(ValueError):
        image.partition_image(filename=filename, strategy="fakeroo")


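# With no strategy argument an image defaults to hi_res, so layout-model
# metadata such as detection_class_prob should be present.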
def test_partition_image_default_strategy_hi_res():
    filename = os.path.join(
        DIRECTORY,
        "..",
        "..",
        "..",
        "example-docs",
        "layout-parser-paper-fast.jpg",
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 2
    assert elements[idx].text == title
    assert elements[idx].metadata.coordinates is not None
    assert elements[idx].metadata.detection_class_prob is not None
    assert isinstance(elements[idx].metadata.detection_class_prob, float)
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        # A bug in partition_groups_from_regions in unstructured-inference loses some sources
        assert {element.metadata.detection_origin for element in elements} == {
            "yolox",
            "ocr_tesseract",
        }


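# The tests below pin down last_modified behavior: the source file's date is
# used for filename input, metadata_last_modified always wins when given, and
# file-like input yields no date unless date_from_file_object=True.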
def test_partition_image_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    elements = image.partition_image(filename=filename)

    assert elements[0].metadata.last_modified == mocked_last_modification_date


def test_partition_image_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES)

    assert elements[0].metadata.last_modified == mocked_last_modification_date


def test_partition_image_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    elements = image.partition_image(
        filename=filename,
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_image_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )
    elements = image.partition_image(
        filename=filename,
        strategy=PartitionStrategy.HI_RES,
        metadata_last_modified=expected_last_modification_date,
    )

    assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_image_from_file_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f)

    assert elements[0].metadata.last_modified is None


def test_partition_image_from_file_explicit_get_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(file=f, date_from_file_object=True)

    assert elements[0].metadata.last_modified == mocked_last_modification_date


def test_partition_image_from_file_with_hi_res_strategy_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        elements = image.partition_image(file=f, strategy=PartitionStrategy.HI_RES)

    assert elements[0].metadata.last_modified is None


def test_partition_image_from_file_with_hi_res_strategy_explicit_get_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )

    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f, strategy=PartitionStrategy.HI_RES, date_from_file_object=True
        )

    assert elements[0].metadata.last_modified == mocked_last_modification_date


def test_partition_image_from_file_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_image_from_file_with_hi_res_strategy_metadata_date_custom_metadata_date(
    mocker,
    filename="example-docs/english-and-korean.png",
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = "2009-07-05T09:24:28"

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    with open(filename, "rb") as f:
        elements = image.partition_image(
            file=f,
            metadata_last_modified=expected_last_modification_date,
            strategy=PartitionStrategy.HI_RES,
        )

    assert elements[0].metadata.last_modified == expected_last_modification_date


def test_partition_image_from_file_without_metadata_date(
    filename="example-docs/english-and-korean.png",
):
    """Test partition_image() with a file from which a last-modified date cannot be read."""
    with open(filename, "rb") as f:
        sf = tempfile.SpooledTemporaryFile()
        sf.write(f.read())
        sf.seek(0)
        elements = image.partition_image(file=sf, date_from_file_object=True)

    assert elements[0].metadata.last_modified is None


def test_partition_image_with_json():
    elements = image.partition_image(
        example_doc_path("layout-parser-paper-fast.jpg"),
        strategy=PartitionStrategy.AUTO,
    )
    assert_round_trips_through_JSON(elements)


def test_partition_image_with_ocr_has_coordinates_from_filename(
    filename="example-docs/english-and-korean.png",
):
    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
    int_coordinates = [(int(x), int(y)) for x, y in elements[0].metadata.coordinates.points]
    assert int_coordinates == [(14, 16), (14, 37), (381, 37), (381, 16)]


@pytest.mark.parametrize(
    "filename",
    [
        "example-docs/layout-parser-paper-with-table.jpg",
        "example-docs/english-and-korean.png",
        "example-docs/layout-parser-paper-fast.jpg",
    ],
)
def test_partition_image_with_ocr_coordinates_are_not_nan_from_filename(
    filename,
):
    import math

    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.OCR_ONLY)
    for element in elements:
        # TODO (jennings) One or more elements are empty strings without
        # coordinates. This should be fixed in a new issue
        if element.text:
            box = element.metadata.coordinates.points
            for point in box:
                assert not math.isnan(point[0])
                assert not math.isnan(point[1])


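# languages=["jpn_vert"] should be formatted into the Tesseract-style
# ocr_languages string ("jpn_vert") handed to the OCR layer.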
def test_partition_image_formats_languages_for_tesseract():
|
|
|
|
filename = "example-docs/jpn-vert.jpeg"
|
Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)
## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR process logics to entire page OCR, and support two OCR
modes, "entire_page" or "individual_blocks".
The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
* OCR the entire image
* merge the OCR layout with inferred page layout
* if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process
This PR also bump `unstructured-inference==0.7.2` since the branch relay
on OCR refactor from unstructured-inference.
## Test
```
from unstructured.partition.auto import partition
entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
2023-10-06 18:54:49 -04:00
|
|
|
with mock.patch(
|
2023-12-01 12:56:31 -08:00
|
|
|
"unstructured.partition.pdf_image.ocr.process_file_with_ocr",
|
Refactor: support entire page OCR with `ocr_mode` and `ocr_languages` (#1579)
## Summary
Second part of OCR refactor to move it from inference repo to
unstructured repo, first part is done in
https://github.com/Unstructured-IO/unstructured-inference/pull/231. This
PR adds OCR process logics to entire page OCR, and support two OCR
modes, "entire_page" or "individual_blocks".
The updated workflow for `Hi_res` partition:
* pass the document as data/filename to inference repo to get
`inferred_layout` (DocumentLayout)
* pass the document as data/filename to OCR module, which first open the
document (create temp file/dir as needed), and split the document by
pages (convert PDF pages to image pages for PDF file)
* if ocr mode is `"entire_page"`
* OCR the entire image
* merge the OCR layout with inferred page layout
* if ocr mode is `"individual_blocks"`
* from inferred page layout, find element with no extracted text, crop
the entire image by the bboxes of the element
* replace empty text element with the text obtained from OCR the cropped
image
* return all merged PageLayouts and form a DocumentLayout subject for
later on process
This PR also bump `unstructured-inference==0.7.2` since the branch relay
on OCR refactor from unstructured-inference.
## Test
```
from unstructured.partition.auto import partition
entrie_page_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="entire_page", ocr_languages="eng+kor", strategy="hi_res")
individual_blocks_ocr_mode_elements = partition(filename="example-docs/english-and-korean.png", ocr_mode="individual_blocks", ocr_languages="eng+kor", strategy="hi_res")
print([el.text for el in entrie_page_ocr_mode_elements])
print([el.text for el in individual_blocks_ocr_mode_elements])
```
latest output:
```
# entrie_page
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'accounts.', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASUREWH HARUTOM|2] 팬 입니다. 팬 으 로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 불 공 평 함 을 LRU, 이 일 을 통해 저 희 의 의 혹 을 전 달 하여 귀 사 의 진지한 민 과 적극적인 답 변 을 받을 수 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were', 'successfully sent', '4. Use the hashtag of Haruto on your tweet to show that vou have sent vour email]', '메 고']
# individual_blocks
['RULES AND INSTRUCTIONS 1. Template for day 1 (korean) , for day 2 (English) for day 3 both English and korean. 2. Use all your accounts. use different emails to send. Its better to have many email', 'Note: Remember to write your own "OPENING MESSAGE" before you copy and paste the template. please always include [TREASURE HARUTO] for example:', '안녕하세요, 저 희 는 YGEAS 그룹 TREASURES HARUTOM| 2] 팬 입니다. 팬 으로서, HARUTO 씨 받 는 대 우 에 대해 의 구 심 과 habe ERO, 이 머 일 을 적극 저 희 의 ASS 전 달 하여 귀 사 의 진지한 고 2 있 기 를 바랍니다.', '3. CC Harutonations@gmail.com so we can keep track of how many emails were ciiccecefisliy cant', 'VULLESSIULY Set 4. Use the hashtag of Haruto on your tweet to show that you have sent your email']
```
---------
Co-authored-by: ryannikolaidis <1208590+ryannikolaidis@users.noreply.github.com>
Co-authored-by: yuming-long <yuming-long@users.noreply.github.com>
Co-authored-by: christinestraub <christinemstraub@gmail.com>
Co-authored-by: christinestraub <christinestraub@users.noreply.github.com>
2023-10-06 18:54:49 -04:00
|
|
|
) as mock_process_file_with_ocr:
|
2023-11-15 21:41:02 -08:00
|
|
|
image.partition_image(
|
|
|
|
filename=filename, strategy=PartitionStrategy.HI_RES, languages=["jpn_vert"]
|
|
|
|
)
|
    _, kwargs = mock_process_file_with_ocr.call_args_list[0]
    assert "ocr_languages" in kwargs
    assert kwargs["ocr_languages"] == "jpn_vert"


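# NOTE: `ocr_languages` is the older kwarg; `languages` replaces it, and passing
# `ocr_languages` should still work but log a deprecation warning.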
def test_partition_image_warns_with_ocr_languages(caplog):
    filename = "example-docs/layout-parser-paper-fast.jpg"
    image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


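# Chunking applied at partition time (`chunking_strategy="by_title"`) should match
# chunking the partitioned elements afterwards with `chunk_by_title()`.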
def test_add_chunking_strategy_on_partition_image(
    filename="example-docs/layout-parser-paper-fast.jpg",
):
    elements = image.partition_image(filename=filename)
    chunk_elements = image.partition_image(filename, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_add_chunking_strategy_on_partition_image_hi_res(
    filename="example-docs/layout-parser-paper-with-table.jpg",
):
    elements = image.partition_image(
        filename=filename,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    chunk_elements = image.partition_image(
        filename,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
        chunking_strategy="by_title",
    )
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


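# `model_name` should be forwarded untouched to the local hi_res partitioning routine;
# the newer `hi_res_model_name` kwarg (next test) appears to be its replacement.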
def test_partition_image_uses_model_name():
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        image.partition_image("example-docs/layout-parser-paper-fast.jpg", model_name="test")
        print(mockpartition.call_args)
        assert "model_name" in mockpartition.call_args.kwargs
        assert mockpartition.call_args.kwargs["model_name"]


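# With `hi_res_model_name`, only the new kwarg should reach the partitioning routine;
# the old `model_name` kwarg must not be injected alongside it.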
def test_partition_image_uses_hi_res_model_name():
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        image.partition_image("example-docs/layout-parser-paper-fast.jpg", hi_res_model_name="test")
        print(mockpartition.call_args)
        assert "model_name" not in mockpartition.call_args.kwargs
        assert "hi_res_model_name" in mockpartition.call_args.kwargs
        assert mockpartition.call_args.kwargs["hi_res_model_name"] == "test"


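# Hi-res OCR supports two modes: "entire_page" OCRs the whole image and merges the OCR
# layout with the inferred layout, while "individual_blocks" OCRs only crops of inferred
# elements that came back without text. The Title element lands at a different index in
# each mode (it also depends on xy-cut ordering), hence the per-mode expected index.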
@pytest.mark.parametrize(
    ("ocr_mode", "idx_title_element"),
    [
        ("entire_page", 2),
        ("individual_blocks", 1),
    ],
)
def test_partition_image_hi_res_ocr_mode(ocr_mode, idx_title_element):
    filename = "example-docs/layout-parser-paper-fast.jpg"
    elements = image.partition_image(
        filename=filename, ocr_mode=ocr_mode, strategy=PartitionStrategy.HI_RES
    )
    # Note(yuming): idx_title_element is different based on xy-cut and ocr mode
    assert elements[idx_title_element].category == ElementType.TITLE


def test_partition_image_hi_res_invalid_ocr_mode():
    filename = "example-docs/layout-parser-paper-fast.jpg"
    with pytest.raises(ValueError):
        _ = image.partition_image(
            filename=filename, ocr_mode="invalid_ocr_mode", strategy=PartitionStrategy.HI_RES
        )


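# Table structure inference reuses the OCR tokens from the page-level OCR pass (one OCR
# run per document, no separate table OCR), so both OCR modes should produce the same
# single HTML table below.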
@pytest.mark.parametrize(
    ("ocr_mode"),
    [
        ("entire_page"),
        ("individual_blocks"),
    ],
)
def test_partition_image_hi_res_ocr_mode_with_table_extraction(ocr_mode):
    filename = "example-docs/layout-parser-paper-with-table.jpg"
    elements = image.partition_image(
        filename=filename,
        ocr_mode=ocr_mode,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 1
    assert "<table><thead><tr>" in table[0]
    assert "</thead><tbody><tr>" in table[0]
    assert "Layouts of history Japanese documents" in table[0]
    assert "Layouts of scanned modern magazines and scientific reports" in table[0]


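# `languages` must be a list of language codes; a bare string such as "eng" is a TypeError.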
def test_partition_image_raises_TypeError_for_invalid_languages():
    filename = "example-docs/layout-parser-paper-fast.jpg"
    with pytest.raises(TypeError):
        image.partition_image(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng")


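# Minimal one-page DocumentLayout used to stub the layout-model call in the next test.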
@pytest.fixture()
def inference_results():
    page = layout.PageLayout(
        number=1,
        image=mock.MagicMock(format="JPEG"),
    )
    page.elements = [layout.LayoutElement.from_coords(0, 0, 600, 800, text="hello")]
    doc = layout.DocumentLayout(pages=[page])
    return doc


def test_partition_image_has_filename(inference_results):
    doc_path = "example-docs"
    filename = "layout-parser-paper-fast.jpg"
    # Mock inference call with known return results
    with mock.patch(
        "unstructured_inference.inference.layout.process_file_with_model",
        return_value=inference_results,
    ) as mock_inference_func:
        elements = image.partition_image(
            filename=os.path.join(doc_path, filename),
            strategy=PartitionStrategy.HI_RES,
        )
    # Make sure we actually went down the path we expect.
    mock_inference_func.assert_called_once()
    # Unpack element but also make sure there is only one
    element = only(elements)
    # This makes sure we are still getting the filetype metadata (should be translated from the
    # fixtures)
    assert element.metadata.filetype == "JPEG"
    # This should be kept from the filename we originally gave
    assert element.metadata.filename == filename


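# With `extract_image_block_to_payload=True`, extracted Image/Table crops should land
# base64-encoded in element metadata (`image_base64` / `image_mime_type`); with False,
# they should be written out as image files under `extract_image_block_output_dir`.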
@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_partition_image_element_extraction(
    file_mode,
    extract_image_block_to_payload,
    filename=example_doc_path("embedded-images-tables.jpg"),
):
    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            elements = image.partition_image(
                filename=filename,
                extract_image_block_types=extract_image_block_types,
                extract_image_block_to_payload=extract_image_block_to_payload,
                extract_image_block_output_dir=tmpdir,
            )
        else:
            with open(filename, "rb") as f:
                elements = image.partition_image(
                    file=f,
                    extract_image_block_types=extract_image_block_types,
                    extract_image_block_to_payload=extract_image_block_to_payload,
                    extract_image_block_output_dir=tmpdir,
                )

        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )


def test_partition_image_works_on_heic_file(
    filename="example-docs/DA-1p.heic",
):
    elements = image.partition_image(filename=filename, strategy=PartitionStrategy.AUTO)
    titles = [el.text for el in elements if el.category == ElementType.TITLE]
    assert "CREATURES" in titles


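# Element IDs should be deterministic: two identical runs over the same file are expected
# to produce identical ID sequences under both hi_res and ocr_only strategies.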
@pytest.mark.parametrize(
    ("strategy"),
    [PartitionStrategy.HI_RES, PartitionStrategy.OCR_ONLY],
)
def test_deterministic_element_ids(strategy: str):
    elements_1 = image.partition_image(
        example_doc_path("layout-parser-paper-with-table.jpg"),
        strategy=strategy,
        starting_page_number=2,
    )
    elements_2 = image.partition_image(
        example_doc_path("layout-parser-paper-with-table.jpg"),
        strategy=strategy,
        starting_page_number=2,
    )
    ids_1 = [element.id for element in elements_1]
    ids_2 = [element.id for element in elements_2]

    assert ids_1 == ids_2


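# A multipage TIFF partitions page by page; with `starting_page_number=2`, a two-page
# file should report page numbers {2, 3} in element metadata.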
def test_multipage_tiff_starts_on_starting_page_number():
    elements = image.partition_image(
        example_doc_path("layout-parser-paper-combined.tiff"),
        starting_page_number=2,
    )
    pages = {element.metadata.page_number for element in elements}

    assert pages == {2, 3}