2024-12-17 16:35:23 +01:00
|
|
|
|
"""Test methods in module docling.backend.patent_uspto_backend.py."""
|
|
|
|
|
|
|
|
|
|
import logging
|
|
|
|
|
import os
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
from tempfile import NamedTemporaryFile
|
|
|
|
|
|
|
|
|
|
import pytest
|
|
|
|
|
from docling_core.types import DoclingDocument
|
|
|
|
|
from docling_core.types.doc import DocItemLabel, TableData, TextItem
|
|
|
|
|
|
|
|
|
|
from docling.backend.xml.uspto_backend import PatentUsptoDocumentBackend, XmlTable
|
|
|
|
|
from docling.datamodel.base_models import InputFormat
|
2025-01-26 08:10:33 +01:00
|
|
|
|
from docling.datamodel.document import InputDocument
|
2024-12-17 16:35:23 +01:00
|
|
|
|
|
2025-03-18 10:38:19 +01:00
|
|
|
|
from .test_data_gen_flag import GEN_TEST_DATA
|
2025-02-20 16:20:07 +01:00
|
|
|
|
from .verify_utils import verify_document
|
|
|
|
|
|
2025-03-18 10:38:19 +01:00
|
|
|
|
# When true, the tests below (re)write the groundtruth files for the checked
# samples before running their assertions.
GENERATE: bool = GEN_TEST_DATA

# Directory with the USPTO input samples; relative, so it is resolved against
# the current working directory.
DATA_PATH: Path = Path("./tests/data/uspto/")
# Directory with the expected exports (.itxt, .json, .md) of the samples.
GT_PATH: Path = Path("./tests/data/groundtruth/docling_v2/")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _generate_groundtruth(doc: DoclingDocument, file_stem: str) -> None:
    """Write the groundtruth exports of ``doc`` into ``GT_PATH``.

    Three files named after ``file_stem`` are produced: the indented-text dump
    (``.itxt``), the JSON serialization (``.json``) and the Markdown export
    (``.md``).

    Args:
        doc: The converted document to export.
        file_stem: File name without extension shared by the three outputs.
    """
    itxt_file = GT_PATH / f"{file_stem}.itxt"
    with itxt_file.open("w", encoding="utf-8") as writer:
        writer.write(doc._export_to_indented_text())

    doc.save_as_json(GT_PATH / f"{file_stem}.json")
    doc.save_as_markdown(GT_PATH / f"{file_stem}.md")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def patents() -> list[tuple[Path, DoclingDocument]]:
    """Convert every USPTO sample under ``DATA_PATH`` once per test module.

    Returns:
        ``(input path, converted document)`` pairs, grouped by file-name
        pattern (``ip*.xml``, ``pg*.xml``, ``pa*.xml``, ``pftaps*.txt``) and
        sorted by path within each group.

    Raises:
        AssertionError: If the backend returns a falsy document for a file.
    """
    patent_paths: list[Path] = []
    for pattern in ("ip*.xml", "pg*.xml", "pa*.xml", "pftaps*.txt"):
        patent_paths.extend(sorted(DATA_PATH.glob(pattern)))

    # The accumulator holds (path, document) tuples, matching the return type;
    # it is deliberately not named like the fixture itself.
    converted: list[tuple[Path, DoclingDocument]] = []
    for in_path in patent_paths:
        in_doc = InputDocument(
            path_or_stream=in_path,
            format=InputFormat.XML_USPTO,
            backend=PatentUsptoDocumentBackend,
        )
        backend = PatentUsptoDocumentBackend(in_doc=in_doc, path_or_stream=in_path)
        logging.info("Converting patent from file %s", in_path)
        doc = backend.convert()
        assert doc, f"Failed to parse document {in_path}"
        converted.append((in_path, doc))

    return converted
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def groundtruth() -> list[tuple[Path, str]]:
    """Load the text content of the patent groundtruth files in ``GT_PATH``.

    Returns:
        ``(file path, file content)`` pairs for every file whose name starts
        with ``ip``, ``pg``, ``pa`` or ``pftaps`` (in that group order, sorted
        by path within each group). Files are decoded as UTF-8.
    """
    loaded: list[tuple[Path, str]] = []
    for prefix in ("ip", "pg", "pa", "pftaps"):
        for gt_file in sorted(GT_PATH.glob(f"{prefix}*")):
            loaded.append((gt_file, gt_file.read_text(encoding="utf-8")))

    return loaded
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.fixture(scope="module")
def tables() -> list[tuple[Path, TableData]]:
    """Parse every ``tables*.xml`` sample in ``DATA_PATH`` with ``XmlTable``.

    Returns:
        ``(file path, parsed table)`` pairs sorted by path.

    Raises:
        AssertionError: If parsing a file yields a falsy result.
    """
    parsed: list[tuple[Path, TableData]] = []
    for xml_file in sorted(DATA_PATH.glob("tables*.xml")):
        xml_text = xml_file.read_text(encoding="utf-8")
        table_data = XmlTable(xml_text).parse()
        assert table_data
        parsed.append((xml_file, table_data))

    return parsed
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@pytest.mark.skip("Slow test")
def test_patent_export(patents):
    """Check that every patent can be saved as non-empty YAML, HTML and Markdown.

    Each export goes to its own temporary file, which is removed again once its
    size has been checked so that the test leaves nothing behind.
    """
    for _, doc in patents:
        exporters = (
            (".yaml", doc.save_as_yaml),
            (".html", doc.save_as_html),
            (".md", doc.save_as_markdown),
        )
        for suffix, save in exporters:
            # delete=False keeps the file on disk once the handle is closed; the
            # handle is only used to reserve a unique name, and the file is
            # removed explicitly in the ``finally`` clause below.
            with NamedTemporaryFile(suffix=suffix, delete=False) as tmp_file:
                tmp_path = Path(tmp_file.name)
            try:
                save(tmp_path)
                assert os.path.getsize(tmp_path) > 0
            finally:
                tmp_path.unlink(missing_ok=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_patent_groundtruth(patents, groundtruth):
    """Compare the exports of each converted patent with the stored groundtruth.

    Patents without any groundtruth file of the same stem are skipped. For the
    others, each export kind (Markdown, JSON, indented text) is checked only if
    the corresponding groundtruth file exists.
    """
    gt_stems: set[str] = {gt_path.stem for gt_path, _ in groundtruth}
    gt_names: dict[str, str] = {
        gt_path.name: content for gt_path, content in groundtruth
    }
    for path, doc in patents:
        if path.stem not in gt_stems:
            continue

        md_name = path.stem + ".md"
        if md_name in gt_names:
            pred_md = doc.export_to_markdown()
            assert pred_md == gt_names[md_name], (
                f"Markdown file mismatch against groundtruth {md_name}"
            )

        # gt_names is keyed by file name including the extension, so the lookup
        # must use "<stem>.json"; the bare stem does not match those keys and
        # left this check unreachable. The groundtruth JSON is stored in
        # GT_PATH next to the other exports, not beside the input sample.
        json_name = path.stem + ".json"
        if json_name in gt_names:
            json_path = GT_PATH / json_name
            assert verify_document(doc, str(json_path), GENERATE), (
                f"JSON file mismatch against groundtruth {json_path}"
            )

        itxt_name = path.stem + ".itxt"
        if itxt_name in gt_names:
            pred_itxt = doc._export_to_indented_text()
            assert pred_itxt == gt_names[itxt_name], (
                f"Indented text file mismatch against groundtruth {itxt_name}"
            )
|
2024-12-17 16:35:23 +01:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_tables(tables):
    """Check the dimensions of one table produced by the XML table parser."""
    # CHECK table in file tables_ipa20180000016.xml
    file_name = "tables_ipa20180000016.xml"
    file_table = next(parsed for source, parsed in tables if source.name == file_name)

    assert file_table.num_rows == 13
    assert file_table.num_cols == 10
    # A full grid is expected: 13 rows x 10 columns.
    assert len(file_table.table_cells) == 130
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_patent_uspto_ice(patents):
    """Test applications and grants Full Text Data/XML Version 4.x ICE.

    Three application samples are checked: one for its overall structure, one
    for HTML entities, second-level headings and tables, and one for the number
    of tables extracted from a document with long, complex tables.
    """

    def load(name: str):
        """Return the first converted document whose source file is ``name``."""
        return next(converted for source, converted in patents if source.name == name)

    def expect_item(items, index, text, label, parent):
        """Assert type, text, label and parent reference of ``items[index]``."""
        item = items[index]
        assert isinstance(item, TextItem)
        assert item.text == text
        assert item.label == label
        assert item.parent.cref == parent

    title = DocItemLabel.TITLE
    header = DocItemLabel.SECTION_HEADER
    paragraph = DocItemLabel.PARAGRAPH

    # CHECK application doc number 20200022300
    file_name = "ipa20200022300.xml"
    doc = load(file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

    assert doc.name == file_name
    texts = doc.texts
    assert len(texts) == 78

    expect_item(
        texts,
        0,
        "SYSTEM FOR CONTROLLING THE OPERATION OF AN ACTUATOR MOUNTED ON A SEED PLANTING IMPLEMENT",
        title,
        "#/body",
    )
    expect_item(texts, 1, "ABSTRACT", header, "#/texts/0")
    abstract_text = (
        "In one aspect, a system for controlling an operation of an actuator mounted "
        "on a seed planting implement may include an actuator configured to adjust a "
        "position of a row unit of the seed planting implement relative to a toolbar "
        "of the seed planting implement. The system may also include a flow restrictor"
        " fluidly coupled to a fluid chamber of the actuator, with the flow restrictor"
        " being configured to reduce a rate at which fluid is permitted to exit the "
        "fluid chamber in a manner that provides damping to the row unit. Furthermore,"
        " the system may include a valve fluidly coupled to the flow restrictor in a "
        "parallel relationship such that the valve is configured to permit the fluid "
        "exiting the fluid chamber to flow through the flow restrictor and the fluid "
        "entering the fluid chamber to bypass the flow restrictor."
    )
    expect_item(texts, 2, abstract_text, paragraph, "#/texts/1")

    expect_item(texts, 3, "FIELD", header, "#/texts/0")
    field_text = (
        "The present disclosure generally relates to seed planting implements and, "
        "more particularly, to systems for controlling the operation of an actuator "
        "mounted on a seed planting implement in a manner that provides damping to "
        "one or more components of the seed planting implement."
    )
    expect_item(texts, 4, field_text, paragraph, "#/texts/3")

    expect_item(texts, 5, "BACKGROUND", header, "#/texts/0")
    background_first = (
        "Modern farming practices strive to increase yields of agricultural fields. In"
        " this respect, seed planting implements are towed behind a tractor or other "
        "work vehicle to deposit seeds in a field. For example, seed planting "
        "implements typically include one or more ground engaging tools or openers "
        "that form a furrow or trench in the soil. One or more dispensing devices of "
        "the seed planting implement may, in turn, deposit seeds into the furrow(s). "
        "After deposition of the seeds, a packer wheel may pack the soil on top of the"
        " deposited seeds."
    )
    expect_item(texts, 6, background_first, paragraph, "#/texts/5")
    background_second = (
        "In certain instances, the packer wheel may also control the penetration depth"
        " of the furrow. In this regard, the position of the packer wheel may be moved"
        " vertically relative to the associated opener(s) to adjust the depth of the "
        "furrow. Additionally, the seed planting implement includes an actuator "
        "configured to exert a downward force on the opener(s) to ensure that the "
        "opener(s) is able to penetrate the soil to the depth set by the packer wheel."
        " However, the seed planting implement may bounce or chatter when traveling at"
        " high speeds and/or when the opener(s) encounters hard or compacted soil. As "
        "such, operators generally operate the seed planting implement with the "
        "actuator exerting more downward force on the opener(s) than is necessary in "
        "order to prevent such bouncing or chatter. Operation of the seed planting "
        "implement with excessive down pressure applied to the opener(s), however, "
        "reduces the overall stability of the seed planting implement."
    )
    expect_item(texts, 7, background_second, paragraph, "#/texts/5")
    background_third = (
        "Accordingly, an improved system for controlling the operation of an actuator "
        "mounted on s seed planting implement to enhance the overall operation of the "
        "implement would be welcomed in the technology."
    )
    expect_item(texts, 8, background_third, paragraph, "#/texts/5")

    expect_item(texts, 9, "BRIEF DESCRIPTION", header, "#/texts/0")
    expect_item(texts, 15, "BRIEF DESCRIPTION OF THE DRAWINGS", header, "#/texts/0")
    figure_text = (
        "FIG. 1 illustrates a perspective view of one embodiment of a seed planting "
        "implement in accordance with aspects of the present subject matter;"
    )
    expect_item(texts, 17, figure_text, paragraph, "#/texts/15")

    expect_item(texts, 27, "DETAILED DESCRIPTION", header, "#/texts/0")
    closing_text = (
        "This written description uses examples to disclose the technology, including "
        "the best mode, and also to enable any person skilled in the art to practice "
        "the technology, including making and using any devices or systems and "
        "performing any incorporated methods. The patentable scope of the technology "
        "is defined by the claims, and may include other examples that occur to those "
        "skilled in the art. Such other examples are intended to be within the scope "
        "of the claims if they include structural elements that do not differ from the"
        " literal language of the claims, or if they include equivalent structural "
        "elements with insubstantial differences from the literal language of the "
        "claims."
    )
    expect_item(texts, 57, closing_text, paragraph, "#/texts/27")

    expect_item(texts, 58, "CLAIMS", header, "#/texts/0")
    last_claim = (
        "19. The system of claim 18, wherein the flow restrictor and the valve are "
        "fluidly coupled in a parallel relationship."
    )
    expect_item(texts, 77, last_claim, paragraph, "#/texts/58")

    # CHECK application doc number 20180000016 for HTML entities, level 2 headings, tables
    file_name = "ipa20180000016.xml"
    doc = load(file_name)
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

    assert doc.name == file_name
    texts = doc.texts
    assert len(texts) == 183

    expect_item(
        texts, 0, "LIGHT EMITTING DEVICE AND PLANT CULTIVATION METHOD", title, "#/body"
    )
    expect_item(texts, 1, "ABSTRACT", header, "#/texts/0")
    # Only type and text are checked for the abstract paragraph of this sample.
    abstract_text = (
        "Provided is a light emitting device that includes a light emitting element "
        "having a light emission peak wavelength ranging from 380 nm to 490 nm, and a "
        "fluorescent material excited by light from the light emitting element and "
        "emitting light having at a light emission peak wavelength ranging from 580 nm"
        " or more to less than 680 nm. The light emitting device emits light having a "
        "ratio R/B of a photon flux density R to a photon flux density B ranging from "
        "2.0 to 4.0 and a ratio R/FR of the photon flux density R to a photon flux "
        "density FR ranging from 0.7 to 13.0, the photon flux density R being in a "
        "wavelength range of 620 nm or more and less than 700 nm, the photon flux "
        "density B being in a wavelength range of 380 nm or more and 490 nm or less, "
        "and the photon flux density FR being in a wavelength range of 700 nm or more "
        "and 780 nm or less."
    )
    assert isinstance(texts[2], TextItem)
    assert texts[2].text == abstract_text

    expect_item(
        texts, 3, "CROSS-REFERENCE TO RELATED APPLICATION", header, "#/texts/0"
    )
    # Item 4 is only type-checked; item 5 is checked without a type assertion.
    assert isinstance(texts[4], TextItem)
    background_header = texts[5]
    assert background_header.text == "BACKGROUND"
    assert background_header.label == header
    assert background_header.parent.cref == "#/texts/0"

    expect_item(texts, 6, "Technical Field", header, "#/texts/0")
    technical_field_text = (
        "The present disclosure relates to a light emitting device and a plant "
        "cultivation method."
    )
    expect_item(texts, 7, technical_field_text, paragraph, "#/texts/6")
    expect_item(texts, 8, "Description of Related Art", header, "#/texts/0")

    entity_text = (
        "wherein r, s, and t are numbers satisfying 0≦r≦1.0, 0≦s≦1.0, 0<t<1.0, and "
        "r+s+t≦1.0."
    )
    expect_item(texts, 63, entity_text, paragraph, "#/texts/51")
    subscript_text = (
        "Examples of the compound containing Al, Ga, or In specifically include Al₂O₃, "
        "Ga₂O₃, and In₂O₃."
    )
    expect_item(texts, 89, subscript_text, paragraph, "#/texts/87")

    # CHECK application doc number 20110039701 for complex long tables
    file_name = "ipa20110039701.xml"
    doc = load(file_name)
    assert doc.name == file_name
    assert len(doc.tables) == 17
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_patent_uspto_grant_v2(patents):
    """Check the conversion of the grant sample ``pg06442728.xml``.

    Verifies the document name, the number of text items, the title and
    abstract items, and the text items around two places where a formula is
    expected to be left out.
    """
    # CHECK grant doc number 06442728
    file_name = "pg06442728.xml"
    doc = next(
        converted for source, converted in patents if source.name == file_name
    )
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

    assert doc.name == file_name
    texts = doc.texts
    assert len(texts) == 108

    title_item = texts[0]
    assert isinstance(title_item, TextItem)
    assert title_item.text == "Methods and apparatus for turbo code"
    assert title_item.label == DocItemLabel.TITLE
    assert title_item.parent.cref == "#/body"

    abstract_header = texts[1]
    assert isinstance(abstract_header, TextItem)
    assert abstract_header.text == "ABSTRACT"
    assert abstract_header.label == DocItemLabel.SECTION_HEADER
    assert abstract_header.parent.cref == "#/texts/0"

    abstract_body = texts[2]
    assert isinstance(abstract_body, TextItem)
    assert abstract_body.text == (
        "An interleaver receives incoming data frames of size N. The interleaver "
        "indexes the elements of the frame with an N₁×N₂ index array. The interleaver "  # noqa: RUF001
        "then effectively rearranges (permutes) the data by permuting the rows of the "
        "index array. The interleaver employs the equation I(j,k)=I(j,αjk+βj)modP) to "  # noqa: RUF001
        "permute the columns (indexed by k) of each row (indexed by j). P is at least "
        "equal to N₂, βj is a constant which may be different for each row, and each "
        "αj is a relative prime number relative to P. After permuting, the "  # noqa: RUF001
        "interleaver outputs the data in a different order than received (e.g., "
        "receives sequentially row by row, outputs sequentially each column by column)."
    )

    # check that the formula has been skipped: the two paragraphs that
    # introduce and follow it are adjacent text items
    before_formula, after_formula = texts[43], texts[44]
    assert before_formula.text == (
        "Calculating the specified equation with the specified values for permuting "
        "row 0 of array D 350 into row 0 of array D₁ 360 proceeds as:"
    )
    assert after_formula.text == (
        "and the permuted data frame is contained in array D₁ 360 shown in FIG. 3. "
        "Outputting the array column by column outputs the frame elements in the "
        "order:"
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_patent_uspto_app_v1(patents):
    """Test applications Full Text Data/XML Version 1.x.

    Checks the title and abstract header, two text items around a skipped
    formula, and the dimensions of the single table of ``pa20010031492.xml``.
    """
    # CHECK application doc number 20010031492
    file_name = "pa20010031492.xml"
    doc = next(
        converted for source, converted in patents if source.name == file_name
    )
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

    assert doc.name == file_name
    texts = doc.texts
    assert len(texts) == 103

    title_item = texts[0]
    assert isinstance(title_item, TextItem)
    assert title_item.text == "Assay reagent"
    assert title_item.label == DocItemLabel.TITLE
    assert title_item.parent.cref == "#/body"

    abstract_header = texts[1]
    assert isinstance(abstract_header, TextItem)
    assert abstract_header.text == "ABSTRACT"
    assert abstract_header.label == DocItemLabel.SECTION_HEADER
    assert abstract_header.parent.cref == "#/texts/0"

    # check that the formula has been skipped
    before_formula, after_formula = texts[62], texts[63]
    assert before_formula.text == (
        "5. The % toxic effect for each sample was calculated as follows:"
    )
    assert after_formula.text == "where: Cₒ=light in control at time zero"

    assert len(doc.tables) == 1
    table_data = doc.tables[0].data
    assert table_data.num_rows == 6
    assert table_data.num_cols == 3
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def test_patent_uspto_grant_aps(patents):
    """Test a Full Text Data/APS sample (``pftaps057006474.txt``).

    Checks the title, the abstract header and paragraph, that no table is
    produced, and that the ``##STR1##`` marker does not appear in any text item.
    """
    # CHECK doc number 057006474
    file_name = "pftaps057006474.txt"
    doc = next(
        converted for source, converted in patents if source.name == file_name
    )
    if GENERATE:
        _generate_groundtruth(doc, Path(file_name).stem)

    assert doc.name == file_name
    texts = doc.texts
    assert len(texts) == 75

    title_item = texts[0]
    assert isinstance(title_item, TextItem)
    assert title_item.text == "Carbocation containing cyanine-type dye"
    assert title_item.label == DocItemLabel.TITLE
    assert title_item.parent.cref == "#/body"

    abstract_header = texts[1]
    assert isinstance(abstract_header, TextItem)
    assert abstract_header.text == "ABSTRACT"
    assert abstract_header.label == DocItemLabel.SECTION_HEADER
    assert abstract_header.parent.cref == "#/texts/0"

    abstract_body = texts[2]
    assert isinstance(abstract_body, TextItem)
    assert abstract_body.text == (
        "To provide a reagent with excellent stability under storage, which can detect"
        " a subject compound to be measured with higher specificity and sensitibity. "
        "Complexes of a compound represented by the general formula (IV):"
    )

    assert len(doc.tables) == 0
    # The structure placeholder must not leak into any text item.
    assert all("##STR1##" not in item.text for item in texts)
|