Feat: return base64 encoded images for PDFs (#2310)
Closes #2302.
### Summary
- add functionality to get a Base64-encoded string from a PIL image (see the sketch below)
- store the Base64-encoded image data in two new metadata fields: `image_base64` and `image_mime_type`
- update the "image element filter" logic to keep all image elements in the output when a user specifies image extraction
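As a rough illustration of the first bullet: a minimal sketch of getting a Base64 string out of a PIL image using only the standard library. The helper name, signature, and choice of JPEG are assumptions for illustration, not the exact code added by this PR.
```
import base64
import io

from PIL import Image


def pil_image_to_base64(image: Image.Image, fmt: str = "JPEG") -> str:
    """Illustrative sketch: serialize a PIL image and Base64-encode the bytes."""
    buffer = io.BytesIO()
    image.save(buffer, format=fmt)  # write the raster data to an in-memory buffer
    return base64.b64encode(buffer.getvalue()).decode("utf-8")
```
The inverse, `Image.open(io.BytesIO(base64.b64decode(data)))`, recovers the image, which is what makes a string payload a practical substitute for writing image files to disk.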
### Testing
```
from unstructured.partition.pdf import partition_pdf

elements = partition_pdf(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    extract_element_types=["Image", "Table"],
    extract_to_payload=True,
)
```
or
```
from unstructured.partition.auto import partition

elements = partition(
    filename="example-docs/embedded-images-tables.pdf",
    strategy="hi_res",
    pdf_extract_element_types=["Image", "Table"],
    pdf_extract_to_payload=True,
)
```
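Either way the extracted images ride along on the elements instead of being written to disk. A quick spot-check of the new fields, assuming the example document yields at least one `Image` element:
```
image_elements = [el for el in elements if el.category == "Image"]
assert image_elements  # the updated filter keeps Image elements in the output
for el in image_elements:
    # both fields are populated when extract_to_payload=True
    assert el.metadata.image_base64 is not None
    assert el.metadata.image_mime_type is not None
```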
import base64
import logging
import math
import os
import tempfile
from pathlib import Path
from tempfile import SpooledTemporaryFile
from unittest import mock

import pytest
from pdf2image.exceptions import PDFPageCountError
from PIL import Image
from unstructured_inference.inference import layout

from test_unstructured.unit_utils import assert_round_trips_through_JSON, example_doc_path
from unstructured.chunking.title import chunk_by_title
from unstructured.documents.coordinates import PixelSpace
from unstructured.documents.elements import (
    CoordinatesMetadata,
    ElementMetadata,
    ElementType,
    Footer,
    Header,
    ListItem,
    NarrativeText,
    Text,
    Title,
)
from unstructured.partition import pdf, strategies
from unstructured.partition.pdf import get_uris_from_annots
from unstructured.partition.pdf_image import ocr, pdfminer_processing
from unstructured.partition.utils.constants import (
    UNSTRUCTURED_INCLUDE_DEBUG_METADATA,
    PartitionStrategy,
)

class MockResponse:
    def __init__(self, status_code, response):
        self.status_code = status_code
        self.response = response

    def json(self):
        return self.response


def mock_healthy_get(url, **kwargs):
    return MockResponse(status_code=200, response={})


def mock_unhealthy_get(url, **kwargs):
    return MockResponse(status_code=500, response={})


def mock_unsuccessful_post(url, **kwargs):
    return MockResponse(status_code=500, response={})


def mock_successful_post(url, **kwargs):
    response = {
        "pages": [
            {
                "number": 0,
                "elements": [
                    {"type": "Title", "text": "Charlie Brown and the Great Pumpkin"},
                ],
            },
            {
                "number": 1,
                "elements": [{"type": "Title", "text": "A Charlie Brown Christmas"}],
            },
        ],
    }
    return MockResponse(status_code=200, response=response)


class MockPageLayout(layout.PageLayout):
    def __init__(self, number: int, image: Image):
        self.number = number
        self.image = image
        self.elements = [
            layout.LayoutElement.from_coords(
                type="Title",
                x1=0,
                y1=0,
                x2=2,
                y2=2,
                text="Charlie Brown and the Great Pumpkin",
            ),
        ]


class MockDocumentLayout(layout.DocumentLayout):
    @property
    def pages(self):
        return [
            MockPageLayout(number=0, image=Image.new("1", (1, 1))),
            MockPageLayout(number=1, image=Image.new("1", (1, 1))),
        ]

@pytest.mark.parametrize(
    ("filename", "file"),
    [
        (example_doc_path("layout-parser-paper-fast.pdf"), None),
        (None, b"0000"),
    ],
)
def test_partition_pdf_local(monkeypatch, filename, file):
    monkeypatch.setattr(
        layout,
        "process_data_with_model",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        layout,
        "process_file_with_model",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        pdfminer_processing,
        "process_data_with_pdfminer",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        pdfminer_processing,
        "process_file_with_pdfminer",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        ocr,
        "process_data_with_ocr",
        lambda *args, **kwargs: MockDocumentLayout(),
    )
    monkeypatch.setattr(
        ocr,
        "process_file_with_ocr",
        lambda *args, **kwargs: MockDocumentLayout(),
    )

    partition_pdf_response = pdf._partition_pdf_or_image_local(filename, file)
    assert partition_pdf_response[0].text == "Charlie Brown and the Great Pumpkin"


def test_partition_pdf_local_raises_with_no_filename():
    with pytest.raises((FileNotFoundError, PDFPageCountError)):
        pdf._partition_pdf_or_image_local(filename="", file=None, is_image=False)

@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
|
2023-05-09 21:39:07 -07:00
|
|
|
@pytest.mark.parametrize(
|
2024-04-15 23:03:42 +02:00
|
|
|
("strategy", "starting_page_number", "expected_page_numbers", "origin"),
|
2023-08-30 18:34:55 -05:00
|
|
|
# fast: can't capture the "intentionally left blank page" page
|
|
|
|
# others: will ignore the actual blank page
|
2023-10-30 01:10:51 -06:00
|
|
|
[
|
2024-04-15 23:03:42 +02:00
|
|
|
(PartitionStrategy.FAST, 1, {1, 4}, {"pdfminer"}),
|
|
|
|
(PartitionStrategy.FAST, 3, {3, 6}, {"pdfminer"}),
|
|
|
|
(PartitionStrategy.HI_RES, 4, {4, 6, 7}, {"yolox", "pdfminer"}),
|
|
|
|
(PartitionStrategy.OCR_ONLY, 1, {1, 3, 4}, {"ocr_tesseract"}),
|
2023-10-30 01:10:51 -06:00
|
|
|
],
|
2023-07-14 13:08:33 -07:00
|
|
|
)
|
2024-04-15 23:03:42 +02:00
|
|
|
def test_partition_pdf_outputs_valid_amount_of_elements_and_metadata_values(
|
2023-08-30 18:34:55 -05:00
|
|
|
file_mode,
|
2023-07-14 13:08:33 -07:00
|
|
|
strategy,
|
2024-04-15 23:03:42 +02:00
|
|
|
starting_page_number,
|
|
|
|
expected_page_numbers,
|
2023-10-05 15:26:47 -05:00
|
|
|
origin,
|
2023-11-03 08:02:43 -07:00
|
|
|
filename=example_doc_path("layout-parser-paper-with-empty-pages.pdf"),
|
2023-07-14 13:08:33 -07:00
|
|
|
):
|
|
|
|
# Test that the partition_pdf function can handle filename
|
2023-08-30 18:34:55 -05:00
|
|
|
def _test(result):
|
2023-05-09 21:39:07 -07:00
|
|
|
# validate that the result is a non-empty list of dicts
|
|
|
|
assert len(result) > 10
|
2023-05-30 15:10:14 -04:00
|
|
|
# check that the pdf has multiple different page numbers
|
2024-04-15 23:03:42 +02:00
|
|
|
assert {element.metadata.page_number for element in result} == expected_page_numbers
|
2023-10-05 15:26:47 -05:00
|
|
|
if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
|
2023-10-30 01:10:51 -06:00
|
|
|
assert {element.metadata.detection_origin for element in result} == origin
|
2023-08-30 18:34:55 -05:00
|
|
|
|
|
|
|
if file_mode == "filename":
|
2024-04-15 23:03:42 +02:00
|
|
|
result = pdf.partition_pdf(
|
|
|
|
filename=filename, strategy=strategy, starting_page_number=starting_page_number
|
|
|
|
)
|
2023-08-30 18:34:55 -05:00
|
|
|
_test(result)
|
|
|
|
elif file_mode == "rb":
|
|
|
|
with open(filename, "rb") as f:
|
2024-04-15 23:03:42 +02:00
|
|
|
result = pdf.partition_pdf(
|
|
|
|
file=f, strategy=strategy, starting_page_number=starting_page_number
|
|
|
|
)
|
2023-08-30 18:34:55 -05:00
|
|
|
_test(result)
|
|
|
|
else:
|
|
|
|
with open(filename, "rb") as test_file:
|
|
|
|
spooled_temp_file = SpooledTemporaryFile()
|
|
|
|
spooled_temp_file.write(test_file.read())
|
|
|
|
spooled_temp_file.seek(0)
|
2024-04-15 23:03:42 +02:00
|
|
|
result = pdf.partition_pdf(
|
|
|
|
file=spooled_temp_file, strategy=strategy, starting_page_number=starting_page_number
|
|
|
|
)
|
2023-08-30 18:34:55 -05:00
|
|
|
_test(result)
|
2023-05-09 21:39:07 -07:00
|
|
|
|
|
|
|
|
2023-06-27 23:06:08 -05:00
|
|
|
@mock.patch.dict(os.environ, {"UNSTRUCTURED_HI_RES_MODEL_NAME": "checkbox"})
def test_partition_pdf_with_model_name_env_var(
    monkeypatch,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    with mock.patch.object(
        layout,
        "process_file_with_model",
        mock.MagicMock(),
    ) as mock_process:
        pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
        assert mock_process.call_args[1]["model_name"] == "checkbox"

@pytest.mark.parametrize("model_name", ["checkbox", "yolox", "chipper"])
|
2023-07-07 11:16:55 -04:00
|
|
|
def test_partition_pdf_with_model_name(
|
|
|
|
monkeypatch,
|
2024-01-31 09:36:59 -08:00
|
|
|
model_name,
|
2023-11-03 08:02:43 -07:00
|
|
|
filename=example_doc_path("layout-parser-paper-fast.pdf"),
|
2023-07-07 11:16:55 -04:00
|
|
|
):
|
2023-07-07 23:41:37 -05:00
|
|
|
monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
|
2023-07-26 15:10:14 -04:00
|
|
|
with mock.patch.object(
|
|
|
|
layout,
|
|
|
|
"process_file_with_model",
|
|
|
|
mock.MagicMock(),
|
|
|
|
) as mock_process:
|
2023-11-15 21:41:02 -08:00
|
|
|
pdf.partition_pdf(
|
2024-01-31 09:36:59 -08:00
|
|
|
filename=filename,
|
|
|
|
strategy=PartitionStrategy.HI_RES,
|
|
|
|
model_name=model_name,
|
2023-11-15 21:41:02 -08:00
|
|
|
)
|
2024-01-31 09:36:59 -08:00
|
|
|
assert mock_process.call_args[1]["model_name"] == model_name
|
|
|
|
|
|
|
|
with mock.patch.object(
|
|
|
|
layout,
|
|
|
|
"process_data_with_model",
|
|
|
|
mock.MagicMock(),
|
|
|
|
) as mock_process:
|
|
|
|
with open(filename, "rb") as f:
|
|
|
|
pdf.partition_pdf(
|
|
|
|
file=f,
|
|
|
|
strategy=PartitionStrategy.HI_RES,
|
|
|
|
model_name=model_name,
|
|
|
|
)
|
|
|
|
assert mock_process.call_args[1]["model_name"] == model_name
|
2023-07-07 11:16:55 -04:00
|
|
|
|
|
|
|
|
2023-12-22 09:06:54 -06:00
|
|
|
def test_partition_pdf_with_hi_res_model_name(
    monkeypatch,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    with mock.patch.object(
        layout,
        "process_file_with_model",
        mock.MagicMock(),
    ) as mock_process:
        pdf.partition_pdf(
            filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
        )
        # unstructured-ingest uses `model_name` instead of `hi_res_model_name`
        assert mock_process.call_args[1]["model_name"] == "checkbox"


def test_partition_pdf_or_image_with_hi_res_model_name(
    monkeypatch,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])
    with mock.patch.object(
        layout,
        "process_file_with_model",
        mock.MagicMock(),
    ) as mock_process:
        pdf.partition_pdf_or_image(
            filename=filename, strategy=PartitionStrategy.HI_RES, hi_res_model_name="checkbox"
        )
        # unstructured-ingest uses `model_name` instead of `hi_res_model_name`
        assert mock_process.call_args[1]["model_name"] == "checkbox"

def test_partition_pdf_with_auto_strategy(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    assert elements[6].text == title
    assert elements[6].metadata.filename == "layout-parser-paper-fast.pdf"
    assert elements[6].metadata.file_directory == os.path.dirname(filename)

def test_partition_pdf_with_page_breaks(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(filename=filename, url=None, include_page_breaks=True)
    assert "PageBreak" in [elem.category for elem in elements]


def test_partition_pdf_with_no_page_breaks(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(filename=filename, url=None)
    assert "PageBreak" not in [elem.category for elem in elements]

def test_partition_pdf_with_fast_strategy(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(
        filename=filename, url=None, strategy=PartitionStrategy.FAST, starting_page_number=3
    )
    assert len(elements) > 10
    # check that the pdf has multiple different page numbers
    assert {element.metadata.page_number for element in elements} == {3, 4}
    for element in elements:
        assert element.metadata.filename == "layout-parser-paper-fast.pdf"


def test_partition_pdf_with_fast_neg_coordinates():
    filename = example_doc_path("negative-coords.pdf")
    elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)
    assert len(elements) == 5
    assert elements[0].metadata.coordinates.points[0][0] < 0
    assert elements[0].metadata.coordinates.points[1][0] < 0

def test_partition_pdf_with_fast_groups_text(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.FAST)

    first_narrative_element = None
    for element in elements:
        if isinstance(element, NarrativeText):
            first_narrative_element = element
            break
    assert len(first_narrative_element.text) > 1000
    assert first_narrative_element.text.startswith("Abstract. Recent advances")
    assert first_narrative_element.text.endswith("https://layout-parser.github.io.")
    assert first_narrative_element.metadata.filename == "layout-parser-paper-fast.pdf"

def test_partition_pdf_with_fast_strategy_from_file(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    with open(filename, "rb") as f:
        elements = pdf.partition_pdf(file=f, url=None, strategy=PartitionStrategy.FAST)
    assert len(elements) > 10


def test_partition_pdf_with_fast_strategy_and_page_breaks(
    caplog,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(
        filename=filename,
        url=None,
        strategy=PartitionStrategy.FAST,
        include_page_breaks=True,
    )
    assert len(elements) > 10
    assert "PageBreak" in [elem.category for elem in elements]

    assert "unstructured_inference is not installed" not in caplog.text
    for element in elements:
        assert element.metadata.filename == "layout-parser-paper-fast.pdf"

def test_partition_pdf_raises_with_bad_strategy(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename, url=None, strategy="made_up")

def test_partition_pdf_falls_back_to_fast(
    monkeypatch,
    caplog,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    # pretend neither unstructured_inference nor pytesseract is installed, so a
    # hi_res request has to fall back to the fast strategy
    def mock_exists(dep):
        return dep not in ["unstructured_inference", "pytesseract"]

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [[Text("Hello there!")], []]
    with mock.patch.object(
        pdf,
        "extractable_elements",
        return_value=mock_return,
    ) as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.HI_RES)

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text
def test_partition_pdf_falls_back_to_fast_from_ocr_only(
    monkeypatch,
    caplog,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    def mock_exists(dep):
        return dep not in ["pytesseract"]

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [[Text("Hello there!")], []]
    with mock.patch.object(
        pdf,
        "extractable_elements",
        return_value=mock_return,
    ) as mock_partition, mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
    ) as mock_partition_ocr:
        pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.OCR_ONLY)

    mock_partition.assert_called_once()
    mock_partition_ocr.assert_not_called()
    assert "pytesseract is not installed" in caplog.text

def test_partition_pdf_falls_back_to_hi_res_from_ocr_only(
    monkeypatch,
    caplog,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    def mock_exists(dep):
        return dep not in ["pytesseract"]

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    mock_return = [Text("Hello there!")]
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
        return_value=mock_return,
    ) as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.OCR_ONLY)

    mock_partition.assert_called_once()
    assert "pytesseract is not installed" in caplog.text

def test_partition_pdf_falls_back_to_ocr_only(
    monkeypatch,
    caplog,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    # pretend only unstructured_inference is missing, so hi_res falls back to ocr_only
    def mock_exists(dep):
        return dep not in ["unstructured_inference"]

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)

    mock_return = [Text("Hello there!")]
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_with_ocr",
        return_value=mock_return,
    ) as mock_partition:
        pdf.partition_pdf(filename=filename, url=None, strategy=PartitionStrategy.HI_RES)

    mock_partition.assert_called_once()
    assert "unstructured_inference is not installed" in caplog.text

def test_partition_pdf_uses_table_extraction():
    filename = example_doc_path("layout-parser-paper-fast.pdf")
    with mock.patch(
        "unstructured.partition.pdf_image.ocr.process_file_with_ocr",
    ) as mock_process_file_with_model:
        pdf.partition_pdf(filename, infer_table_structure=True)
        assert mock_process_file_with_model.call_args[1]["infer_table_structure"]

@pytest.mark.parametrize(
    ("ocr_mode"),
    [
        ("entire_page"),
        ("individual_blocks"),
    ],
)
def test_partition_pdf_hi_table_extraction_with_languages(ocr_mode):
    filename = example_doc_path("korean-text-with-tables.pdf")
    elements = pdf.partition_pdf(
        filename=filename,
        ocr_mode=ocr_mode,
        languages=["kor"],
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert elements[0].metadata.languages == ["kor"]
|
Chore (refactor): support table extraction with pre-computed ocr data (#1801)
### Summary
Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.
**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image
### Test
* Not much to manually testing expect table extraction still works
* But due to change on scaling and use pre-computed OCR data from entire
page, there are some slight (better) changes on table output, here is an
comparison on test outputs i found from the same test
`test_partition_image_with_table_extraction`:
screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">
### TODO
(added as a ticket) Still have some clean up to do in inference repo
since now unst repo have duplicate logic, but can keep them as a fall
back plan. If we want to remove anything OCR related in inference, here
are items that is deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR`
### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`
---------
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-20 20:24:23 -04:00
|
|
|
assert len(table) == 2
|
2024-06-14 11:11:38 -07:00
|
|
|
assert "<table><thead><tr>" in table[0]
|
|
|
|
assert "</thead><tbody><tr>" in table[0]
|
Chore (refactor): support table extraction with pre-computed ocr data (#1801)
### Summary
Table OCR refactor, move the OCR part for table model in inference repo
to unst repo.
* Before this PR, table model extracts OCR tokens with texts and
bounding box and fills the tokens to the table structure in inference
repo. This means we need to do an additional OCR for tables.
* After this PR, we use the OCR data from entire page OCR and pass the
OCR tokens to inference repo, which means we only do one OCR for the
entire document.
**Tech details:**
* Combined env `ENTIRE_PAGE_OCR` and `TABLE_OCR` to `OCR_AGENT`, this
means we use the same OCR agent for entire page and tables since we only
do one OCR.
* Bump inference repo to `0.7.9`, which allow table model in inference
to use pre-computed OCR data from unst repo. Please check in
[PR](https://github.com/Unstructured-IO/unstructured-inference/pull/256).
* All notebooks lint are made by `make tidy`
* This PR also fixes
[issue](https://github.com/Unstructured-IO/unstructured/issues/1564),
I've added test for the issue in
`test_pdf.py::test_partition_pdf_hi_table_extraction_with_languages`
* Add same scaling logic to image [similar to previous Table
OCR](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L109C1-L113),
but now scaling is applied to entire image
### Test
* Not much to manually testing expect table extraction still works
* But due to change on scaling and use pre-computed OCR data from entire
page, there are some slight (better) changes on table output, here is an
comparison on test outputs i found from the same test
`test_partition_image_with_table_extraction`:
screen shot for table in `layout-parser-paper-with-table.jpg`:
<img width="343" alt="expected"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/278d7665-d212-433d-9a05-872c4502725c">
before refactor:
<img width="709" alt="before"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/347fbc3b-f52b-45b5-97e9-6f633eaa0d5e">
after refactor:
<img width="705" alt="after"
src="https://github.com/Unstructured-IO/unstructured/assets/63475068/b3cbd809-cf67-4e75-945a-5cbd06b33b2d">
### TODO
(added as a ticket) Still have some clean up to do in inference repo
since now unst repo have duplicate logic, but can keep them as a fall
back plan. If we want to remove anything OCR related in inference, here
are items that is deprecated and can be removed:
*
[`get_tokens`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L77)
(already noted in code)
* parameter `extract_tables` in inference
*
[`interpret_table_block`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/inference/layoutelement.py#L88)
*
[`load_agent`](https://github.com/Unstructured-IO/unstructured-inference/blob/main/unstructured_inference/models/tables.py#L197)
* env `TABLE_OCR`
### Note
if we want to fallback for an additional table OCR (may need this for
using paddle for table), we need to:
* pass `infer_table_structure` to inference with `extract_tables`
parameter
* stop passing `infer_table_structure` to `ocr.py`
---------
Co-authored-by: Yao You <yao@unstructured.io>
2023-10-20 20:24:23 -04:00
|
|
|
# FIXME(yuming): didn't test full sentence here since unit test and docker test have
|
|
|
|
# some differences on spaces between characters
|
|
|
|
assert "업" in table[0]
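

# Regression context (issue #2011): `languages` used to be missing from element
# metadata for the "fast" and "hi_res" strategies (e.g.
# partition_pdf(filename, strategy="fast", languages=["kor"]) yielded elements whose
# metadata.languages was unset); it should now be populated for every strategy.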
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.FAST,
        PartitionStrategy.HI_RES,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_strategies_keep_languages_metadata(strategy):
    filename = example_doc_path("korean-text-with-tables.pdf")
    elements = pdf.partition_pdf(
        filename=filename,
        languages=["kor"],
        strategy=strategy,
    )
    assert elements[0].metadata.languages == ["kor"]
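

# After the table-OCR refactor (PR #1801), tables reuse the OCR tokens computed for
# the entire page instead of running a second OCR pass, and the former
# ENTIRE_PAGE_OCR / TABLE_OCR env vars are combined into a single OCR_AGENT, so
# `ocr_mode` should not change whether table structure is extracted.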
@pytest.mark.parametrize(
    "ocr_mode",
    [
        "entire_page",
        "individual_blocks",
    ],
)
def test_partition_pdf_hi_res_ocr_mode_with_table_extraction(ocr_mode):
    filename = example_doc_path("layout-parser-paper.pdf")
    elements = pdf.partition_pdf(
        filename=filename,
        ocr_mode=ocr_mode,
        strategy=PartitionStrategy.HI_RES,
        infer_table_structure=True,
    )
    table = [el.metadata.text_as_html for el in elements if el.metadata.text_as_html]
    assert len(table) == 2
    assert "<table><thead><tr>" in table[0]
    assert "</thead><tbody><tr>" in table[0]
    assert "Layouts of history Japanese documents" in table[0]
    assert "Layouts of scanned modern magazines and scientific report" in table[0]
    assert "Layouts of scanned US newspapers from the 20th century" in table[0]


def test_partition_pdf_with_copy_protection():
    filename = os.path.join("example-docs", "copy-protected.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES)
    title = "LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis"
    idx = 2
    assert elements[idx].text == title
    assert {element.metadata.page_number for element in elements} == {1, 2}
    assert elements[idx].metadata.detection_class_prob is not None
    assert isinstance(elements[idx].metadata.detection_class_prob, float)


def test_partition_pdf_with_dpi():
    filename = os.path.join("example-docs", "copy-protected.pdf")
    with mock.patch.object(layout, "process_file_with_model", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, pdf_image_dpi=100)
        assert mock_process.call_args[1]["pdf_image_dpi"] == 100


def test_partition_pdf_requiring_recursive_text_grab(filename=example_doc_path("reliance.pdf")):
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
    assert len(elements) > 50
    assert elements[0].metadata.page_number == 1
    assert elements[-1].metadata.page_number == 3


def test_partition_pdf_text_not_extractable():
    filename = example_doc_path("loremipsum-flat.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
    assert len(elements) == 0


def test_partition_pdf_fails_if_pdf_not_processable(
    monkeypatch,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    def mock_exists(dep):
        return dep not in ["unstructured_inference", "pytesseract"]

    monkeypatch.setattr(strategies, "dependency_exists", mock_exists)
    monkeypatch.setattr(pdf, "extractable_elements", lambda *args, **kwargs: [])

    with pytest.raises(ValueError):
        pdf.partition_pdf(filename=filename)


def test_partition_pdf_fast_groups_text_in_text_box():
    filename = os.path.join("example-docs", "chevron-page.pdf")
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.FAST)
    expected_coordinate_points_0 = (
        (193.1741, 71.94000000000005),
        (193.1741, 91.94000000000005),
        (418.6881, 91.94000000000005),
        (418.6881, 71.94000000000005),
    )
    expected_coordinate_system_0 = PixelSpace(width=612, height=792)
    expected_elem_metadata_0 = ElementMetadata(
        coordinates=CoordinatesMetadata(
            points=expected_coordinate_points_0,
            system=expected_coordinate_system_0,
        ),
    )
    assert elements[0] == Title(
        "eastern mediterranean",
        metadata=expected_elem_metadata_0,
    )
    assert isinstance(elements[1], NarrativeText)
    assert str(elements[1]).startswith("We")
    assert str(elements[1]).endswith("Jordan and Egypt.")

    expected_coordinate_points_3 = (
        (95.6683, 181.16470000000004),
        (95.6683, 226.16470000000004),
        (166.7908, 226.16470000000004),
        (166.7908, 181.16470000000004),
    )
    expected_coordinate_system_3 = PixelSpace(width=612, height=792)
    expected_elem_metadata_3 = ElementMetadata(
        coordinates=CoordinatesMetadata(
            points=expected_coordinate_points_3,
            system=expected_coordinate_system_3,
        ),
    )
    assert elements[2] == Text("2.5", metadata=expected_elem_metadata_3)


def test_partition_pdf_with_metadata_filename(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(
        filename=filename,
        url=None,
        include_page_breaks=True,
        metadata_filename="test",
    )
    for element in elements:
        assert element.metadata.filename == "test"


def test_partition_pdf_with_fast_strategy_from_file_with_metadata_filename(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    with open(filename, "rb") as f:
        elements = pdf.partition_pdf(
            file=f,
            url=None,
            strategy=PartitionStrategy.FAST,
            metadata_filename="test",
        )
    for element in elements:
        assert element.metadata.filename == "test"


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.HI_RES,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
    ],
)
def test_partition_pdf_exclude_metadata(
    file_mode,
    strategy,
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    if file_mode == "filename":
        elements = pdf.partition_pdf(
            filename=filename,
            strategy=strategy,
            include_metadata=False,
        )
    else:
        with open(filename, "rb") as f:
            elements = pdf.partition_pdf(
                file=f,
                url=None,
                strategy=strategy,
                include_metadata=False,
            )

    for element in elements:
        assert element.metadata.to_dict() == {}
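

# Expected-date resolution for the test below: an explicit metadata_last_modified
# always wins; otherwise the (mocked) file date is used, except that a file-like
# input read with date_from_file_object=False has no date source at all and so
# yields None.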
@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    "strategy",
    [
        PartitionStrategy.AUTO,
        PartitionStrategy.HI_RES,
        PartitionStrategy.FAST,
        PartitionStrategy.OCR_ONLY,
    ],
)
@pytest.mark.parametrize("last_modification_date", [None, "2020-07-05T09:24:28"])
@pytest.mark.parametrize("date_from_file_object", [True, False])
def test_partition_pdf_metadata_date(
    mocker,
    file_mode,
    strategy,
    last_modification_date,
    date_from_file_object,
    filename=example_doc_path("copy-protected.pdf"),
):
    mocked_last_modification_date = "2029-07-05T09:24:28"
    expected_last_modification_date = (
        last_modification_date if last_modification_date else mocked_last_modification_date
    )
    if not date_from_file_object and not last_modification_date and file_mode != "filename":
        expected_last_modification_date = None

    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date_from_file",
        return_value=mocked_last_modification_date,
    )
    mocker.patch(
        "unstructured.partition.pdf_image.pdf_image_utils.get_last_modified_date",
        return_value=mocked_last_modification_date,
    )

    if file_mode == "filename":
        elements = pdf.partition_pdf(
            filename=filename,
            strategy=strategy,
            metadata_last_modified=last_modification_date,
            date_from_file_object=date_from_file_object,
        )
    elif file_mode == "rb":
        with open(filename, "rb") as f:
            elements = pdf.partition_pdf(
                file=f,
                strategy=strategy,
                metadata_last_modified=last_modification_date,
                date_from_file_object=date_from_file_object,
            )
    else:
        with open(filename, "rb") as test_file:
            spooled_temp_file = SpooledTemporaryFile()
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            elements = pdf.partition_pdf(
                file=spooled_temp_file,
                strategy=strategy,
                metadata_last_modified=last_modification_date,
                date_from_file_object=date_from_file_object,
            )

    assert {el.metadata.last_modified for el in elements} == {expected_last_modification_date}


@pytest.mark.parametrize("strategy", [PartitionStrategy.FAST, PartitionStrategy.HI_RES])
def test_partition_pdf_with_json(strategy: str):
    elements = pdf.partition_pdf(
        example_doc_path("layout-parser-paper-fast.pdf"),
        strategy=strategy,
    )
    assert_round_trips_through_JSON(elements)


def test_add_chunking_strategy_by_title_on_partition_pdf(
    filename=example_doc_path("layout-parser-paper-fast.pdf"),
):
    elements = pdf.partition_pdf(filename=filename)
    chunk_elements = pdf.partition_pdf(filename, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)
    assert chunk_elements != elements
    assert chunk_elements == chunks


def test_partition_pdf_formats_languages_for_tesseract():
    filename = example_doc_path("DA-1p.pdf")
    with mock.patch.object(ocr, "process_file_with_ocr", mock.MagicMock()) as mock_process:
        pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages=["en"])
        assert mock_process.call_args[1]["ocr_languages"] == "eng"


def test_partition_pdf_warns_with_ocr_languages(caplog):
    filename = example_doc_path("chevron-page.pdf")
    pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, ocr_languages="eng")
    assert "The ocr_languages kwarg will be deprecated" in caplog.text


def test_partition_categorization_backup():
    text = "This is Clearly a Title"
    with mock.patch.object(pdf, "_partition_pdf_or_image_local", return_value=[Text(text)]):
        elements = pdf.partition_pdf_or_image(
            example_doc_path("layout-parser-paper-fast.pdf"),
            strategy=PartitionStrategy.HI_RES,
        )
        # Should have changed the element class from Text to Title
        assert isinstance(elements[0], Title)
        assert elements[0].text == text
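

# PR #1393: the auto strategy now re-joins numbered list items that the fast
# strategy previously broke across lines, e.g. "1. An off-the-shelf toolkit for
# applying DL models for layout detection, character" and "recognition, and other
# DIA tasks (Section 3)" become one ListItem, so fewer, complete elements result.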
@pytest.mark.parametrize(
    "filename",
    [example_doc_path("layout-parser-paper-fast.pdf")],
)
def test_combine_numbered_list(filename):
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
    first_list_element = None
    for element in elements:
        if isinstance(element, ListItem):
            first_list_element = element
            break
    assert len(elements) < 28
    assert len([element for element in elements if isinstance(element, ListItem)]) == 4
    assert first_list_element.text.endswith(
        "character recognition, and other DIA tasks (Section 3)",
    )
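

# PR #1539: the fast strategy captures embedded hyperlinks (external or internal)
# and stores them in metadata.links as dicts of the form
#     {"text": "link", "url": "https://...", "start_index": 12}
# where start_index is the position of the link text within the element's text.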
@pytest.mark.parametrize(
    "filename",
    [example_doc_path("layout-parser-paper-fast.pdf")],
)
def test_partition_pdf_hyperlinks(filename):
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
    links = [
        {
            "text": "8",
            "url": "cite.gardner2018allennlp",
            "start_index": 138,
        },
        {
            "text": "34",
            "url": "cite.wolf2019huggingface",
            "start_index": 141,
        },
        {
            "text": "35",
            "url": "cite.wu2019detectron2",
            "start_index": 168,
        },
    ]
    assert elements[-1].metadata.links == links


@pytest.mark.parametrize(
    "filename",
    [example_doc_path("embedded-link.pdf")],
)
def test_partition_pdf_hyperlinks_multiple_lines(filename):
    elements = pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.AUTO)
    assert elements[-1].metadata.links[-1]["text"] == "capturing"
    assert len(elements[-1].metadata.links) == 2
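

# This test and the next verify that both the legacy `model_name` kwarg and the
# newer `hi_res_model_name` kwarg reach _partition_pdf_or_image_local unchanged.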
def test_partition_pdf_uses_model_name():
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        pdf.partition_pdf(
            example_doc_path("layout-parser-paper-fast.pdf"),
            model_name="test",
            strategy=PartitionStrategy.HI_RES,
        )

        mockpartition.assert_called_once()
        assert "model_name" in mockpartition.call_args.kwargs
        assert mockpartition.call_args.kwargs["model_name"]


def test_partition_pdf_uses_hi_res_model_name():
    with mock.patch.object(
        pdf,
        "_partition_pdf_or_image_local",
    ) as mockpartition:
        pdf.partition_pdf(
            example_doc_path("layout-parser-paper-fast.pdf"),
            hi_res_model_name="test",
            strategy=PartitionStrategy.HI_RES,
        )

        mockpartition.assert_called_once()
        assert "hi_res_model_name" in mockpartition.call_args.kwargs
        assert mockpartition.call_args.kwargs["hi_res_model_name"]


def test_partition_pdf_word_bbox_not_char(
    filename=example_doc_path("interface-config-guide-p93.pdf"),
):
    try:
        elements = pdf.partition_pdf(filename=filename, strategy="fast")
    except Exception as e:
        pytest.fail("Partitioning failed: %s" % e)
    assert len(elements) == 17


def test_partition_pdf_fast_no_mapping_errors(
    filename=example_doc_path("a1977-backus-p21.pdf"),
):
    """Verify there is no regression for https://github.com/Unstructured-IO/unstructured/pull/2940,
    failing to map old parent_id's to new"""
    pdf.partition_pdf(filename=filename, strategy="fast")


def test_partition_pdf_raises_TypeError_for_invalid_languages():
    filename = example_doc_path("chevron-page.pdf")
    with pytest.raises(TypeError):
        pdf.partition_pdf(filename=filename, strategy=PartitionStrategy.HI_RES, languages="eng")
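

# Per the cases below, an annotation is kept when it is on the requested page and
# its overlap with the element bbox, as a fraction of the annotation's own area,
# meets the threshold: [0, 0, 3, 1] overlaps [0, 0, 1, 1] by 1/3 of its area, so
# it passes at threshold 0.1 but not at 0.4.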
@pytest.mark.parametrize(
    ("threshold", "expected"),
    [
        (0.4, [True, False, False, False, False]),
        (0.1, [True, True, False, False, False]),
    ],
)
def test_check_annotations_within_element(threshold, expected):
    annotations = [
        {"bbox": [0, 0, 1, 1], "page_number": 1},
        {"bbox": [0, 0, 3, 1], "page_number": 1},
        {"bbox": [0, 0, 1, 1], "page_number": 2},
        {"bbox": [0, 0, 0, 1], "page_number": 1},
        {"bbox": [3, 0, 4, 1], "page_number": 1},
    ]
    element_bbox = (0, 0, 1, 1)
    filtered = pdf.check_annotations_within_element(annotations, element_bbox, 1, threshold)
    results = [annotation in filtered for annotation in annotations]
    assert results == expected


@pytest.mark.parametrize(
    ("env", "expected"),
    [
        (None, "yolox"),
        ("test", "test"),
    ],
)
def test_default_hi_res_model(env, expected, monkeypatch):
    if env is not None:
        monkeypatch.setenv("UNSTRUCTURED_HI_RES_MODEL_NAME", env)
    assert pdf.default_hi_res_model() == expected


def test_partition_model_name_default_to_None():
    filename = example_doc_path("DA-1p.pdf")
    try:
        pdf.partition_pdf(
            filename=filename,
            strategy=PartitionStrategy.HI_RES,
            ocr_languages="eng",
            model_name=None,
        )
    except AttributeError:
        pytest.fail("partition_pdf() raised AttributeError unexpectedly!")


def test_partition_hi_res_model_name_default_to_None():
    filename = example_doc_path("DA-1p.pdf")
    try:
        pdf.partition_pdf(
            filename=filename,
            strategy=PartitionStrategy.HI_RES,
            hi_res_model_name=None,
        )
    except AttributeError:
        pytest.fail("partition_pdf() raised AttributeError unexpectedly!")


@pytest.mark.parametrize(
    ("strategy", "ocr_func"),
    [
        (
            PartitionStrategy.HI_RES,
            "unstructured_pytesseract.image_to_data",
        ),
        (
            PartitionStrategy.OCR_ONLY,
            "unstructured_pytesseract.image_to_data",
        ),
        (
            PartitionStrategy.OCR_ONLY,
            "unstructured_pytesseract.image_to_string",
        ),
    ],
)
def test_ocr_language_passes_through(strategy, ocr_func):
    # Create an exception that will be raised directly after OCR is called to stop execution
    class CallException(Exception):
        pass

    mock_ocr_func = mock.Mock(side_effect=CallException("Function called!"))
    # Patch the ocr function with the mock that will record the call and then terminate
    with mock.patch(ocr_func, mock_ocr_func), pytest.raises(CallException):
        pdf.partition_pdf(
            example_doc_path("layout-parser-paper-fast.pdf"),
            strategy=strategy,
            ocr_languages="kor",
        )
    # Check that the language parameter was passed down as expected
    kwargs = mock_ocr_func.call_args.kwargs
    assert "lang" in kwargs
    assert kwargs["lang"] == "kor"
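

# The second parametrized case below feeds malformed annotations (a Rect that is a
# plain string and a Rect with only three coordinates); both should be skipped
# without raising, yielding zero URIs.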
@pytest.mark.parametrize(
|
|
|
|
("annots", "height", "coordinate_system", "page_number", "expected"),
|
|
|
|
[
|
|
|
|
(["BS", "BE"], 300, PixelSpace(300, 300), 1, 0),
|
|
|
|
(
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"Type": "/'Annot'",
|
|
|
|
"Subtype": "/'Link'",
|
|
|
|
"A": {
|
|
|
|
"Type": "/'Action'",
|
|
|
|
"S": "/'URI'",
|
|
|
|
"URI": "b'https://layout-parser.github.io'",
|
|
|
|
},
|
|
|
|
"BS": {"S": "/'S'", "W": 1},
|
|
|
|
"Border": [0, 0, 1],
|
|
|
|
"C": [0, 1, 1],
|
|
|
|
"H": "/'I'",
|
|
|
|
"Rect": [304.055, 224.156, 452.472, 234.368],
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"Type": "/'Annot'",
|
|
|
|
"Subtype": "/'Link'",
|
|
|
|
"A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
|
|
|
|
"BS": {"S": "/'S'", "W": 1},
|
|
|
|
"Border": [0, 0, 1],
|
|
|
|
"C": [0, 1, 0],
|
|
|
|
"H": "/'I'",
|
2023-11-29 14:00:15 -05:00
|
|
|
"Rect": (468.305, 128.081, 480.26, 136.494),
|
2023-10-30 20:38:57 -04:00
|
|
|
},
|
|
|
|
],
|
|
|
|
792,
|
|
|
|
PixelSpace(612, 792),
|
|
|
|
1,
|
|
|
|
2,
|
|
|
|
),
|
2023-11-29 14:00:15 -05:00
|
|
|
(
|
|
|
|
[
|
|
|
|
{
|
|
|
|
"Type": "/'Annot'",
|
|
|
|
"Subtype": "/'Link'",
|
|
|
|
"A": {
|
|
|
|
"Type": "/'Action'",
|
|
|
|
"S": "/'URI'",
|
|
|
|
"URI": "b'https://layout-parser.github.io'",
|
|
|
|
},
|
|
|
|
"BS": {"S": "/'S'", "W": 1},
|
|
|
|
"Border": [0, 0, 1],
|
|
|
|
"C": [0, 1, 1],
|
|
|
|
"H": "/'I'",
|
|
|
|
"Rect": "I am not a tuple or list!",
|
|
|
|
},
|
|
|
|
{
|
|
|
|
"Type": "/'Annot'",
|
|
|
|
"Subtype": "/'Link'",
|
|
|
|
"A": {"S": "/'GoTo'", "D": "b'cite.harley2015evaluation'"},
|
|
|
|
"BS": {"S": "/'S'", "W": 1},
|
|
|
|
"Border": [0, 0, 1],
|
|
|
|
"C": [0, 1, 0],
|
|
|
|
"H": "/'I'",
|
|
|
|
"Rect": (468.305, 128.081, 480.26),
|
|
|
|
},
|
|
|
|
],
|
|
|
|
792,
|
|
|
|
PixelSpace(612, 792),
|
|
|
|
1,
|
|
|
|
0,
|
|
|
|
),
|
2023-10-30 20:38:57 -04:00
|
|
|
],
|
|
|
|
)
|
|
|
|
def test_get_uris_from_annots_string_annotation(
|
|
|
|
annots, height, coordinate_system, page_number, expected
|
|
|
|
):
|
|
|
|
annotation_list = get_uris_from_annots(annots, height, coordinate_system, page_number)
|
|
|
|
assert len(annotation_list) == expected
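
# In the last case above, both annotations carry a malformed "Rect" (a plain
# string and a 3-tuple), so neither can be converted to a link annotation,
# which is why the expected count is 0.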


@pytest.mark.parametrize("file_mode", ["filename", "rb", "spool"])
@pytest.mark.parametrize(
    ("filename", "is_image"),
    [
        (example_doc_path("layout-parser-paper-fast.pdf"), False),
        (example_doc_path("layout-parser-paper-fast.jpg"), True),
    ],
)
def test_partition_pdf_with_ocr_only_strategy(
    file_mode,
    filename,
    is_image,
):
    if file_mode == "filename":
        elements = pdf.partition_pdf(
            filename=filename,
            strategy=PartitionStrategy.OCR_ONLY,
            languages=["eng"],
            is_image=is_image,
        )
    elif file_mode == "rb":
        with open(filename, "rb") as f:
            elements = pdf.partition_pdf(
                file=f,
                strategy=PartitionStrategy.OCR_ONLY,
                languages=["eng"],
                is_image=is_image,
            )
    else:
        with open(filename, "rb") as test_file:
            spooled_temp_file = SpooledTemporaryFile()
            spooled_temp_file.write(test_file.read())
            spooled_temp_file.seek(0)
            elements = pdf.partition_pdf(
                file=spooled_temp_file,
                strategy=PartitionStrategy.OCR_ONLY,
                languages=["eng"],
                is_image=is_image,
            )

    assert elements[0].metadata.languages == ["eng"]
    # check pages
    if is_image:
        assert {el.metadata.page_number for el in elements} == {1}
    else:
        assert {el.metadata.page_number for el in elements} == {1, 2}

    # check coordinates
    for element in elements:
        if element.metadata.coordinates:
            for point in element.metadata.coordinates.points:
                if point[0] and point[1]:
                    assert point[0] is not math.nan
                    assert point[1] is not math.nan

    # check detection origin
    if UNSTRUCTURED_INCLUDE_DEBUG_METADATA:
        assert {element.metadata.detection_origin for element in elements} == {"ocr_tesseract"}
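
# Note on the coordinate check above: `point[0] is not math.nan` only rules
# out the `math.nan` singleton object. A stricter check would be
# `assert not math.isnan(point[0])`, since NaN never compares equal to itself.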


def test_partition_pdf_with_all_number_table_and_ocr_only_strategy():
    # AttributeError was previously being raised when partitioning documents that contained only
    # numerical values with `strategy=PartitionStrategy.OCR_ONLY`
    filename = example_doc_path("all-number-table.pdf")
    assert pdf.partition_pdf(filename, strategy=PartitionStrategy.OCR_ONLY)


# As of pdfminer 221105, this pdf throws an error and requires a workaround
# See #2059
def test_partition_pdf_with_bad_color_profile():
    filename = example_doc_path("pdf-bad-color-space.pdf")
    assert pdf.partition_pdf(filename, strategy="fast")


def test_partition_pdf_with_fast_finds_headers_footers(filename="example-docs/header-test-doc.pdf"):
    elements = pdf.partition_pdf(filename, strategy="fast")
    assert isinstance(elements[0], Header)
    assert isinstance(elements[-1], Footer)
    assert [element.text for element in elements] == [
        "I Am A Header",
        "Title",
        "Here is a lovely sentences.",
        "I Am A Footer",
    ]


@pytest.mark.parametrize(
    ("filename", "expected_log"),
    [
        ("invalid-pdf-structure-pdfminer-entire-doc.pdf", "Repairing the PDF document ..."),
        ("invalid-pdf-structure-pdfminer-one-page.pdf", "Repairing the PDF page 2 ..."),
    ],
)
def test_extractable_elements_repair_invalid_pdf_structure(filename, expected_log, caplog):
    caplog.set_level(logging.INFO)
    assert pdf.extractable_elements(filename=example_doc_path(filename))
    assert expected_log in caplog.text


def assert_element_extraction(
    elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
):
    extracted_elements = []
    for el_type in extract_image_block_types:
        extracted_elements_by_type = []
        for el in elements:
            if el.category == el_type:
                extracted_elements_by_type.append(el)
        extracted_elements.append(extracted_elements_by_type)

    for extracted_elements_by_type in extracted_elements:
        for i, el in enumerate(extracted_elements_by_type):
            if extract_image_block_to_payload:
                assert el.metadata.image_base64 is not None
                assert el.metadata.image_mime_type == "image/jpeg"
                image_data = base64.b64decode(el.metadata.image_base64)
                assert isinstance(image_data, bytes)
                assert el.metadata.image_path is None
            else:
                basename = "table" if el.category == ElementType.TABLE else "figure"
                expected_image_path = os.path.join(
                    str(tmpdir), f"{basename}-{el.metadata.page_number}-{i + 1}.jpg"
                )
                assert el.metadata.image_path == expected_image_path
                assert os.path.isfile(expected_image_path)
                assert el.metadata.image_base64 is None
                assert el.metadata.image_mime_type is None
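
# A minimal sketch (an illustration, not part of this suite) of how a caller
# might rehydrate the base64 payload checked above, reusing the `base64` and
# PIL `Image` imports at the top of this module; `io` would need importing:
#
#   import io
#   image = Image.open(io.BytesIO(base64.b64decode(el.metadata.image_base64)))
#   image.size  # (width, height) of the extracted block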


@pytest.mark.parametrize("file_mode", ["filename", "rb"])
@pytest.mark.parametrize("extract_image_block_to_payload", [False, True])
def test_partition_pdf_element_extraction(
    file_mode,
    extract_image_block_to_payload,
    filename=example_doc_path("embedded-images-tables.pdf"),
):
    extract_image_block_types = ["Image", "Table"]

    with tempfile.TemporaryDirectory() as tmpdir:
        if file_mode == "filename":
            elements = pdf.partition_pdf(
                filename=filename,
                # Image extraction shouldn't break by setting this
                starting_page_number=20,
                extract_image_block_types=extract_image_block_types,
                extract_image_block_to_payload=extract_image_block_to_payload,
                extract_image_block_output_dir=tmpdir,
            )
        else:
            with open(filename, "rb") as f:
                elements = pdf.partition_pdf(
                    file=f,
                    # Image extraction shouldn't break by setting this
                    starting_page_number=20,
                    extract_image_block_types=extract_image_block_types,
                    extract_image_block_to_payload=extract_image_block_to_payload,
                    extract_image_block_output_dir=tmpdir,
                )

        assert elements[0].metadata.page_number == 20
        assert_element_extraction(
            elements, extract_image_block_types, extract_image_block_to_payload, tmpdir
        )


def test_partition_pdf_always_keep_all_image_elements(
    filename=example_doc_path("embedded-images.pdf"),
):
    elements = pdf.partition_pdf(
        filename=filename,
        strategy="hi_res",
    )
    image_elements = [el for el in elements if el.category == ElementType.IMAGE]
    assert len(image_elements) == 3


@pytest.fixture()
def expected_element_ids_for_fast_strategy():
    return [
        "27a6cb3e5a4ad399b2f865729bbd3840",
        "a90a54baba0093296a013d26b7acbc17",
        "9be424e2d151dac4b5f36a85e9bbfe65",
        "4631da875fb4996c63b2d80cea6b588e",
        "6264f4eda97a049f4710f9bea0c01cbd",
        "abded7b2ff3a5542c88b4a831755ec24",
        "b781ea5123cb31e0571391b7b42cac75",
        "033f27d2618ba4cda9068b267b5a731e",
        "8982a12fcced30dd12ccbf61d14f30bf",
        "41af2fd5df0cf47aa7e8ecca200d3ac6",
    ]


@pytest.fixture()
def expected_element_ids_for_hi_res_strategy():
    return [
        "27a6cb3e5a4ad399b2f865729bbd3840",
        "a90a54baba0093296a013d26b7acbc17",
        "9be424e2d151dac4b5f36a85e9bbfe65",
        "4631da875fb4996c63b2d80cea6b588e",
        "6264f4eda97a049f4710f9bea0c01cbd",
        "abded7b2ff3a5542c88b4a831755ec24",
        "b781ea5123cb31e0571391b7b42cac75",
        "033f27d2618ba4cda9068b267b5a731e",
        "8982a12fcced30dd12ccbf61d14f30bf",
        "41af2fd5df0cf47aa7e8ecca200d3ac6",
    ]


@pytest.fixture()
def expected_element_ids_for_ocr_strategy():
    return [
        "272ab65cbe81795161128aea59599d83",
        "b38affd7bbbb3dddf5c85ba8b14d380d",
        "65903214d456b8b3cba6faa6714bd9ba",
        "5b41ceae05dcfaeeac32ff8e82dc2ff1",
        "6582fc6c6c595225feeddcc3263f0ae3",
        "64b610c8f4274f1ce2175bf30814409d",
        "8edde8bf2d3a68370dc4bd142c408ca4",
        "a052bc17696043efce2e4f4f28393a83",
    ]


@pytest.fixture()
def expected_ids(request):
    return request.getfixturevalue(request.param)
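
# The "fast" and "hi_res" fixtures above intentionally hold identical IDs:
# element IDs appear to be content-derived hashes, and both strategies extract
# the same text from this document. OCR_ONLY re-reads text from rendered
# pixels, so its elements (and therefore their IDs) differ.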


@pytest.mark.parametrize(
    ("strategy", "expected_ids"),
    [
        (PartitionStrategy.FAST, "expected_element_ids_for_fast_strategy"),
        (PartitionStrategy.HI_RES, "expected_element_ids_for_hi_res_strategy"),
        (PartitionStrategy.OCR_ONLY, "expected_element_ids_for_ocr_strategy"),
    ],
    indirect=["expected_ids"],
)
def test_unique_and_deterministic_element_ids(strategy, expected_ids):
    elements = pdf.partition_pdf(
        "example-docs/fake-memo-with-duplicate-page.pdf", strategy=strategy, starting_page_number=2
    )
    ids = [element.id for element in elements]
    assert ids == expected_ids, "Element IDs do not match expected IDs"


def test_analysis_artifacts_saved():
    with tempfile.TemporaryDirectory() as temp_dir:
        filename = example_doc_path("layout-parser-paper-fast.pdf")
        pdf.partition_pdf(
            filename=filename,
            strategy=PartitionStrategy.HI_RES,
            analysis=True,
            analyzed_image_output_dir_path=temp_dir,
        )

        analysis_dir = Path(temp_dir)
        layout_dump_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "layout_dump"
        assert layout_dump_dir.exists()
        layout_dump_files = list(layout_dump_dir.iterdir())
        assert len(layout_dump_files) == 1
        assert (layout_dump_dir / "object_detection.json").exists()

        bboxes_dir = analysis_dir / "analysis" / "layout-parser-paper-fast" / "bboxes"
        assert bboxes_dir.exists()
        bboxes_files = list(bboxes_dir.iterdir())
        assert len(bboxes_files) == 2 * 4  # 2 pages * 4 different layouts per page

        expected_layouts = ["od_model", "ocr", "pdfminer", "final"]
        expected_pages = [1, 2]
        for el in expected_layouts:
            for page in expected_pages:
                assert bboxes_dir / f"page{page}_layout_{el}.png" in bboxes_files