docling/tests/test_backend_patent_uspto.py

459 lines
20 KiB
Python
Raw Normal View History

"""Test methods in module docling.backend.patent_uspto_backend.py."""
import logging
import os
from pathlib import Path
from tempfile import NamedTemporaryFile
import pytest
from docling_core.types import DoclingDocument
from docling_core.types.doc import DocItemLabel, TableData, TextItem
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import InputDocument
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905) * Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-03-18 10:38:19 +01:00
from .test_data_gen_flag import GEN_TEST_DATA
from .verify_utils import verify_document
feat: Add DoclingParseV4 backend, using high-level docling-parse API (#905) * Add DoclingParseV3 backend implementation Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Use docling-core with docling-parse types Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes and test updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix streams Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset tests Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test cases Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * update test units Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add back DoclingParse v1 backend, pipeline options Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update locks Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * fix: update docling-core to 2.22.0 Update dependency library docling-core to latest release 2.22.0 Fix regression tests and ground truth files Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> * Ground-truth files updated Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Update tests, use TextCell.from_ocr property Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Text fixes, new test data Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Rename docling backend to v4 Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Test all backends, fixes Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Reset all tests to use docling-parse v1 for now Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fixes for DPv4 backend init, better test coverage Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * test_input_doc use default backend Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com> Signed-off-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com> Co-authored-by: Cesar Berrospi Ramis <75900930+ceberam@users.noreply.github.com>
2025-03-18 10:38:19 +01:00
GENERATE: bool = GEN_TEST_DATA
DATA_PATH: Path = Path("./tests/data/uspto/")
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:
with open(GT_PATH / f"{file_stem}.itxt", "w", encoding="utf-8") as file_obj:
file_obj.write(doc._export_to_indented_text())
doc.save_as_json(GT_PATH / f"{file_stem}.json")
doc.save_as_markdown(GT_PATH / f"{file_stem}.md")
@pytest.fixture(scope="module")
def patents() -> list[tuple[Path, DoclingDocument]]:
patent_paths = (
sorted(DATA_PATH.glob("ip*.xml"))
+ sorted(DATA_PATH.glob("pg*.xml"))
+ sorted(DATA_PATH.glob("pa*.xml"))
+ sorted(DATA_PATH.glob("pftaps*.txt"))
)
patents: list[dict[Path, DoclingDocument]] = []
for in_path in patent_paths:
in_doc = InputDocument(
path_or_stream=in_path,
format=InputFormat.XML_USPTO,
backend=PatentUsptoDocumentBackend,
)
backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)
logging.info(f"Converting patent from file {in_path}")
doc = backend.convert()
assert doc, f"Failed to parse document {in_path}"
patents.append((in_path, doc))
return patents
@pytest.fixture(scope="module")
def groundtruth() -> list[tuple[Path, str]]:
patent_paths = (
sorted(GT_PATH.glob("ip*"))
+ sorted(GT_PATH.glob("pg*"))
+ sorted(GT_PATH.glob("pa*"))
+ sorted(GT_PATH.glob("pftaps*"))
)
groundtruth: list[tuple[Path, str]] = []
for in_path in patent_paths:
with open(in_path, encoding="utf-8") as file_obj:
content = file_obj.read()
groundtruth.append((in_path, content))
return groundtruth
@pytest.fixture(scope="module")
def tables() -> list[tuple[Path, TableData]]:
table_paths = sorted(DATA_PATH.glob("tables*.xml"))
tables: list[tuple[Path, TableData]] = []
for in_path in table_paths:
with open(in_path, encoding="utf-8") as file_obj:
content = file_obj.read()
parser = XmlTable(content)
parsed_table = parser.parse()
assert parsed_table
tables.append((in_path, parsed_table))
return tables
@pytest.mark.skip("Slow test")
def test_patent_export(patents):
for _, doc in patents:
with NamedTemporaryFile(suffix=".yaml", delete=False) as tmp_file:
doc.save_as_yaml(Path(tmp_file.name))
assert os.path.getsize(tmp_file.name) > 0
with NamedTemporaryFile(suffix=".html", delete=False) as tmp_file:
doc.save_as_html(Path(tmp_file.name))
assert os.path.getsize(tmp_file.name) > 0
with NamedTemporaryFile(suffix=".md", delete=False) as tmp_file:
doc.save_as_markdown(Path(tmp_file.name))
assert os.path.getsize(tmp_file.name) > 0
def test_patent_groundtruth(patents, groundtruth):
gt_stems: list[str] = [item[0].stem for item in groundtruth]
gt_names: dict[str, str] = {item[0].name: item[1] for item in groundtruth}
for path, doc in patents:
if path.stem not in gt_stems:
continue
md_name = path.stem + ".md"
if md_name in gt_names:
pred_md = doc.export_to_markdown()
assert pred_md == gt_names[md_name], (
f"Markdown file mismatch against groundtruth {md_name}"
)
json_path = path.with_suffix(".json")
if json_path.stem in gt_names:
assert verify_document(doc, str(json_path), GENERATE), (
f"JSON file mismatch against groundtruth {json_path}"
)
itxt_name = path.stem + ".itxt"
if itxt_name in gt_names:
pred_itxt = doc._export_to_indented_text()
assert pred_itxt == gt_names[itxt_name], (
f"Indented text file mismatch against groundtruth {itxt_name}"
)
def test_tables(tables):
"""Test the table parser."""
# CHECK table in file tables_20180000016.xml
file_name = "tables_ipa20180000016.xml"
file_table = next(item[1] for item in tables if item[0].name == file_name)
assert file_table.num_rows == 13
assert file_table.num_cols == 10
assert len(file_table.table_cells) == 130
def test_patent_uspto_ice(patents):
"""Test applications and grants Full Text Data/XML Version 4.x ICE."""
# CHECK application doc number 20200022300
file_name = "ipa20200022300.xml"
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
assert doc.name == file_name
texts = doc.texts
assert len(texts) == 78
assert isinstance(texts[0], TextItem)
assert (
texts[0].text
== "SYSTEM FOR CONTROLLING THE OPERATION OF AN ACTUATOR MOUNTED ON A SEED PLANTING IMPLEMENT"
)
assert texts[0].label == DocItemLabel.TITLE
assert texts[0].parent.cref == "#/body"
assert isinstance(texts[1], TextItem)
assert texts[1].text == "ABSTRACT"
assert texts[1].label == DocItemLabel.SECTION_HEADER
assert texts[1].parent.cref == "#/texts/0"
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"In one aspect, a system for controlling an operation of an actuator mounted "
"on a seed planting implement may include an actuator configured to adjust a "
"position of a row unit of the seed planting implement relative to a toolbar "
"of the seed planting implement. The system may also include a flow restrictor"
" fluidly coupled to a fluid chamber of the actuator, with the flow restrictor"
" being configured to reduce a rate at which fluid is permitted to exit the "
"fluid chamber in a manner that provides damping to the row unit. Furthermore,"
" the system may include a valve fluidly coupled to the flow restrictor in a "
"parallel relationship such that the valve is configured to permit the fluid "
"exiting the fluid chamber to flow through the flow restrictor and the fluid "
"entering the fluid chamber to bypass the flow restrictor."
)
assert texts[2].label == DocItemLabel.PARAGRAPH
assert texts[2].parent.cref == "#/texts/1"
assert isinstance(texts[3], TextItem)
assert texts[3].text == "FIELD"
assert texts[3].label == DocItemLabel.SECTION_HEADER
assert texts[3].parent.cref == "#/texts/0"
assert isinstance(texts[4], TextItem)
assert texts[4].text == (
"The present disclosure generally relates to seed planting implements and, "
"more particularly, to systems for controlling the operation of an actuator "
"mounted on a seed planting implement in a manner that provides damping to "
"one or more components of the seed planting implement."
)
assert texts[4].label == DocItemLabel.PARAGRAPH
assert texts[4].parent.cref == "#/texts/3"
assert isinstance(texts[5], TextItem)
assert texts[5].text == "BACKGROUND"
assert texts[5].label == DocItemLabel.SECTION_HEADER
assert texts[5].parent.cref == "#/texts/0"
assert isinstance(texts[6], TextItem)
assert texts[6].text == (
"Modern farming practices strive to increase yields of agricultural fields. In"
" this respect, seed planting implements are towed behind a tractor or other "
"work vehicle to deposit seeds in a field. For example, seed planting "
"implements typically include one or more ground engaging tools or openers "
"that form a furrow or trench in the soil. One or more dispensing devices of "
"the seed planting implement may, in turn, deposit seeds into the furrow(s). "
"After deposition of the seeds, a packer wheel may pack the soil on top of the"
" deposited seeds."
)
assert texts[6].label == DocItemLabel.PARAGRAPH
assert texts[6].parent.cref == "#/texts/5"
assert isinstance(texts[7], TextItem)
assert texts[7].text == (
"In certain instances, the packer wheel may also control the penetration depth"
" of the furrow. In this regard, the position of the packer wheel may be moved"
" vertically relative to the associated opener(s) to adjust the depth of the "
"furrow. Additionally, the seed planting implement includes an actuator "
"configured to exert a downward force on the opener(s) to ensure that the "
"opener(s) is able to penetrate the soil to the depth set by the packer wheel."
" However, the seed planting implement may bounce or chatter when traveling at"
" high speeds and/or when the opener(s) encounters hard or compacted soil. As "
"such, operators generally operate the seed planting implement with the "
"actuator exerting more downward force on the opener(s) than is necessary in "
"order to prevent such bouncing or chatter. Operation of the seed planting "
"implement with excessive down pressure applied to the opener(s), however, "
"reduces the overall stability of the seed planting implement."
)
assert texts[7].label == DocItemLabel.PARAGRAPH
assert texts[7].parent.cref == "#/texts/5"
assert isinstance(texts[8], TextItem)
assert texts[8].text == (
"Accordingly, an improved system for controlling the operation of an actuator "
"mounted on s seed planting implement to enhance the overall operation of the "
"implement would be welcomed in the technology."
)
assert texts[8].label == DocItemLabel.PARAGRAPH
assert texts[8].parent.cref == "#/texts/5"
assert isinstance(texts[9], TextItem)
assert texts[9].text == "BRIEF DESCRIPTION"
assert texts[9].label == DocItemLabel.SECTION_HEADER
assert texts[9].parent.cref == "#/texts/0"
assert isinstance(texts[15], TextItem)
assert texts[15].text == "BRIEF DESCRIPTION OF THE DRAWINGS"
assert texts[15].label == DocItemLabel.SECTION_HEADER
assert texts[15].parent.cref == "#/texts/0"
assert isinstance(texts[17], TextItem)
assert texts[17].text == (
"FIG. 1 illustrates a perspective view of one embodiment of a seed planting "
"implement in accordance with aspects of the present subject matter;"
)
assert texts[17].label == DocItemLabel.PARAGRAPH
assert texts[17].parent.cref == "#/texts/15"
assert isinstance(texts[27], TextItem)
assert texts[27].text == "DETAILED DESCRIPTION"
assert texts[27].label == DocItemLabel.SECTION_HEADER
assert texts[27].parent.cref == "#/texts/0"
assert isinstance(texts[57], TextItem)
assert texts[57].text == (
"This written description uses examples to disclose the technology, including "
"the best mode, and also to enable any person skilled in the art to practice "
"the technology, including making and using any devices or systems and "
"performing any incorporated methods. The patentable scope of the technology "
"is defined by the claims, and may include other examples that occur to those "
"skilled in the art. Such other examples are intended to be within the scope "
"of the claims if they include structural elements that do not differ from the"
" literal language of the claims, or if they include equivalent structural "
"elements with insubstantial differences from the literal language of the "
"claims."
)
assert texts[57].label == DocItemLabel.PARAGRAPH
assert texts[57].parent.cref == "#/texts/27"
assert isinstance(texts[58], TextItem)
assert texts[58].text == "CLAIMS"
assert texts[58].label == DocItemLabel.SECTION_HEADER
assert texts[58].parent.cref == "#/texts/0"
assert isinstance(texts[77], TextItem)
assert texts[77].text == (
"19. The system of claim 18, wherein the flow restrictor and the valve are "
"fluidly coupled in a parallel relationship."
)
assert texts[77].label == DocItemLabel.PARAGRAPH
assert texts[77].parent.cref == "#/texts/58"
# CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
file_name = "ipa20180000016.xml"
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
assert doc.name == file_name
texts = doc.texts
assert len(texts) == 183
assert isinstance(texts[0], TextItem)
assert texts[0].text == "LIGHT EMITTING DEVICE AND PLANT CULTIVATION METHOD"
assert texts[0].label == DocItemLabel.TITLE
assert texts[0].parent.cref == "#/body"
assert isinstance(texts[1], TextItem)
assert texts[1].text == "ABSTRACT"
assert texts[1].label == DocItemLabel.SECTION_HEADER
assert texts[1].parent.cref == "#/texts/0"
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"Provided is a light emitting device that includes a light emitting element "
"having a light emission peak wavelength ranging from 380 nm to 490 nm, and a "
"fluorescent material excited by light from the light emitting element and "
"emitting light having at a light emission peak wavelength ranging from 580 nm"
" or more to less than 680 nm. The light emitting device emits light having a "
"ratio R/B of a photon flux density R to a photon flux density B ranging from "
"2.0 to 4.0 and a ratio R/FR of the photon flux density R to a photon flux "
"density FR ranging from 0.7 to 13.0, the photon flux density R being in a "
"wavelength range of 620 nm or more and less than 700 nm, the photon flux "
"density B being in a wavelength range of 380 nm or more and 490 nm or less, "
"and the photon flux density FR being in a wavelength range of 700 nm or more "
"and 780 nm or less."
)
assert isinstance(texts[3], TextItem)
assert texts[3].text == "CROSS-REFERENCE TO RELATED APPLICATION"
assert texts[3].label == DocItemLabel.SECTION_HEADER
assert texts[3].parent.cref == "#/texts/0"
assert isinstance(texts[4], TextItem)
assert texts[5].text == "BACKGROUND"
assert texts[5].label == DocItemLabel.SECTION_HEADER
assert texts[5].parent.cref == "#/texts/0"
assert isinstance(texts[6], TextItem)
assert texts[6].text == "Technical Field"
assert texts[6].label == DocItemLabel.SECTION_HEADER
assert texts[6].parent.cref == "#/texts/0"
assert isinstance(texts[7], TextItem)
assert texts[7].text == (
"The present disclosure relates to a light emitting device and a plant "
"cultivation method."
)
assert texts[7].label == DocItemLabel.PARAGRAPH
assert texts[7].parent.cref == "#/texts/6"
assert isinstance(texts[8], TextItem)
assert texts[8].text == "Description of Related Art"
assert texts[8].label == DocItemLabel.SECTION_HEADER
assert texts[8].parent.cref == "#/texts/0"
assert isinstance(texts[63], TextItem)
assert texts[63].text == (
"wherein r, s, and t are numbers satisfying 0≦r≦1.0, 0≦s≦1.0, 0<t<1.0, and "
"r+s+t≦1.0."
)
assert texts[63].label == DocItemLabel.PARAGRAPH
assert texts[63].parent.cref == "#/texts/51"
assert isinstance(texts[89], TextItem)
assert texts[89].text == (
"Examples of the compound containing Al, Ga, or In specifically include Al₂O₃, "
"Ga₂O₃, and In₂O₃."
)
assert texts[89].label == DocItemLabel.PARAGRAPH
assert texts[89].parent.cref == "#/texts/87"
# CHECK application doc number 20110039701 for complex long tables
file_name = "ipa20110039701.xml"
doc = next(item[1] for item in patents if item[0].name == file_name)
assert doc.name == file_name
assert len(doc.tables) == 17
def test_patent_uspto_grant_v2(patents):
"""Test applications and grants Full Text Data/APS."""
# CHECK application doc number 06442728
file_name = "pg06442728.xml"
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
assert doc.name == file_name
texts = doc.texts
assert len(texts) == 108
assert isinstance(texts[0], TextItem)
assert texts[0].text == "Methods and apparatus for turbo code"
assert texts[0].label == DocItemLabel.TITLE
assert texts[0].parent.cref == "#/body"
assert isinstance(texts[1], TextItem)
assert texts[1].text == "ABSTRACT"
assert texts[1].label == DocItemLabel.SECTION_HEADER
assert texts[1].parent.cref == "#/texts/0"
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"An interleaver receives incoming data frames of size N. The interleaver "
"indexes the elements of the frame with an N₁×N₂ index array. The interleaver " # noqa: RUF001
"then effectively rearranges (permutes) the data by permuting the rows of the "
"index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to " # noqa: RUF001
"permute the columns (indexed by k) of each row (indexed by j). P is at least "
"equal to N₂, βj is a constant which may be different for each row, and each "
"αj is a relative prime number relative to P. After permuting, the " # noqa: RUF001
"interleaver outputs the data in a different order than received (e.g., "
"receives sequentially row by row, outputs sequentially each column by column)."
)
# check that the formula has been skipped
assert texts[43].text == (
"Calculating the specified equation with the specified values for permuting "
"row 0 of array D 350 into row 0 of array D₁ 360 proceeds as:"
)
assert texts[44].text == (
"and the permuted data frame is contained in array D₁ 360 shown in FIG. 3. "
"Outputting the array column by column outputs the frame elements in the "
"order:"
)
def test_patent_uspto_app_v1(patents):
"""Test applications Full Text Data/XML Version 1.x."""
# CHECK application doc number 20010031492
file_name = "pa20010031492.xml"
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
assert doc.name == file_name
texts = doc.texts
assert len(texts) == 103
assert isinstance(texts[0], TextItem)
assert texts[0].text == "Assay reagent"
assert texts[0].label == DocItemLabel.TITLE
assert texts[0].parent.cref == "#/body"
assert isinstance(texts[1], TextItem)
assert texts[1].text == "ABSTRACT"
assert texts[1].label == DocItemLabel.SECTION_HEADER
assert texts[1].parent.cref == "#/texts/0"
# check that the formula has been skipped
assert texts[62].text == (
"5. The % toxic effect for each sample was calculated as follows:"
)
assert texts[63].text == "where: Cₒ=light in control at time zero"
assert len(doc.tables) == 1
assert doc.tables[0].data.num_rows == 6
assert doc.tables[0].data.num_cols == 3
def test_patent_uspto_grant_aps(patents):
"""Test applications Full Text Data/APS."""
# CHECK application doc number 057006474
file_name = "pftaps057006474.txt"
doc = next(item[1] for item in patents if item[0].name == file_name)
if GENERATE:
_generate_groundtruth(doc, Path(file_name).stem)
assert doc.name == file_name
texts = doc.texts
assert len(texts) == 75
assert isinstance(texts[0], TextItem)
assert texts[0].text == "Carbocation containing cyanine-type dye"
assert texts[0].label == DocItemLabel.TITLE
assert texts[0].parent.cref == "#/body"
assert isinstance(texts[1], TextItem)
assert texts[1].text == "ABSTRACT"
assert texts[1].label == DocItemLabel.SECTION_HEADER
assert texts[1].parent.cref == "#/texts/0"
assert isinstance(texts[2], TextItem)
assert texts[2].text == (
"To provide a reagent with excellent stability under storage, which can detect"
" a subject compound to be measured with higher specificity and sensitibity. "
"Complexes of a compound represented by the general formula (IV):"
)
assert len(doc.tables) == 0
for item in texts:
assert "##STR1##" not in item.text