mirror of
https://github.com/docling-project/docling.git
synced 2025-06-27 05:20:05 +00:00
feat: Establish confidence estimation for document and pages (#1313)
* Establish confidence field, propagate layout confidence through Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add OCR confidence and parse confidence (stub) Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add parse quality rules, use 5% percentile for overall and parse scores Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Heuristic updates Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Fix garbage regex Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Move grade to page Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Introduce mean_score and low_score, consistent aggregate computations Signed-off-by: Christoph Auer <cau@zurich.ibm.com> * Add confidence test Signed-off-by: Christoph Auer <cau@zurich.ibm.com> --------- Signed-off-by: Christoph Auer <cau@zurich.ibm.com>
This commit is contained in:
parent
14d4f5b109
commit
90875247e5
@ -1,6 +1,9 @@
|
||||
import math
|
||||
from collections import defaultdict
|
||||
from enum import Enum
|
||||
from typing import TYPE_CHECKING, Dict, List, Optional, Union
|
||||
from typing import TYPE_CHECKING, Annotated, Dict, List, Literal, Optional, Union
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import (
|
||||
BoundingBox,
|
||||
DocItemLabel,
|
||||
@ -16,7 +19,7 @@ from docling_core.types.io import (
|
||||
DocumentStream,
|
||||
)
|
||||
from PIL.Image import Image
|
||||
from pydantic import BaseModel, ConfigDict
|
||||
from pydantic import BaseModel, ConfigDict, Field, computed_field
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from docling.backend.pdf_backend import PdfPageBackend
|
||||
@ -298,3 +301,97 @@ class OpenAiApiResponse(BaseModel):
|
||||
choices: List[OpenAiResponseChoice]
|
||||
created: int
|
||||
usage: OpenAiResponseUsage
|
||||
|
||||
|
||||
# Create a type alias for score values.
# Scores are plain floats; np.nan is used as the "not evaluated" default
# (see PageConfidenceScores field defaults).
ScoreValue = float
|
||||
|
||||
|
||||
class QualityGrade(str, Enum):
    """Discrete quality buckets derived from numeric confidence scores.

    Inherits from ``str`` so values serialize as plain strings
    (e.g. in pydantic model dumps).
    """

    POOR = "poor"
    FAIR = "fair"
    GOOD = "good"
    EXCELLENT = "excellent"
    # Returned when a score cannot be bucketed (e.g. it is NaN).
    UNSPECIFIED = "unspecified"
|
||||
|
||||
|
||||
class PageConfidenceScores(BaseModel):
    """Per-page confidence scores for each processing stage.

    Each score lives in [0, 1]; ``np.nan`` means the stage was not
    evaluated for this page and is ignored by the aggregates.
    """

    parse_score: ScoreValue = np.nan
    layout_score: ScoreValue = np.nan
    table_score: ScoreValue = np.nan
    ocr_score: ScoreValue = np.nan

    def _score_to_grade(self, score: ScoreValue) -> QualityGrade:
        """Map a numeric score onto a QualityGrade bucket.

        NaN compares False against every threshold, so it falls all the
        way through and yields UNSPECIFIED.
        """
        for upper_bound, grade in (
            (0.5, QualityGrade.POOR),
            (0.8, QualityGrade.FAIR),
            (0.9, QualityGrade.GOOD),
        ):
            if score < upper_bound:
                return grade
        if score >= 0.9:
            return QualityGrade.EXCELLENT
        return QualityGrade.UNSPECIFIED

    @computed_field  # type: ignore
    @property
    def mean_grade(self) -> QualityGrade:
        """Grade derived from the mean of the stage scores."""
        return self._score_to_grade(self.mean_score)

    @computed_field  # type: ignore
    @property
    def low_grade(self) -> QualityGrade:
        """Grade derived from the pessimistic (5th-percentile) score."""
        return self._score_to_grade(self.low_score)

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """NaN-aware mean over the four stage scores."""
        stage_scores = [
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        ]
        return ScoreValue(np.nanmean(stage_scores))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """NaN-aware 5th percentile over the four stage scores.

        Emphasises the weakest stage rather than the average one.
        """
        stage_scores = [
            self.ocr_score,
            self.table_score,
            self.layout_score,
            self.parse_score,
        ]
        return ScoreValue(np.nanquantile(stage_scores, q=0.05))
|
||||
|
||||
|
||||
class ConfidenceReport(PageConfidenceScores):
    """Document-level confidence: aggregate scores plus a per-page breakdown.

    ``pages`` defaults to a defaultdict so callers can assign into
    ``pages[page_no].<score>`` without pre-creating entries.
    """

    pages: Dict[int, PageConfidenceScores] = Field(
        default_factory=lambda: defaultdict(PageConfidenceScores)
    )

    @computed_field  # type: ignore
    @property
    def mean_score(self) -> ScoreValue:
        """NaN-aware mean of the per-page mean scores."""
        page_means = [scores.mean_score for scores in self.pages.values()]
        return ScoreValue(np.nanmean(page_means))

    @computed_field  # type: ignore
    @property
    def low_score(self) -> ScoreValue:
        """NaN-aware mean of the per-page low (5th-percentile) scores."""
        page_lows = [scores.low_score for scores in self.pages.values()]
        return ScoreValue(np.nanmean(page_lows))
|
||||
|
@ -47,7 +47,7 @@ from docling_core.types.legacy_doc.document import (
|
||||
)
|
||||
from docling_core.utils.file import resolve_source_to_stream
|
||||
from docling_core.utils.legacy import docling_document_to_legacy
|
||||
from pydantic import BaseModel
|
||||
from pydantic import BaseModel, Field
|
||||
from typing_extensions import deprecated
|
||||
|
||||
from docling.backend.abstract_backend import (
|
||||
@ -56,6 +56,7 @@ from docling.backend.abstract_backend import (
|
||||
)
|
||||
from docling.datamodel.base_models import (
|
||||
AssembledUnit,
|
||||
ConfidenceReport,
|
||||
ConversionStatus,
|
||||
DocumentStream,
|
||||
ErrorItem,
|
||||
@ -201,6 +202,7 @@ class ConversionResult(BaseModel):
|
||||
pages: List[Page] = []
|
||||
assembled: AssembledUnit = AssembledUnit()
|
||||
timings: Dict[str, ProfilingItem] = {}
|
||||
confidence: ConfidenceReport = Field(default_factory=ConfidenceReport)
|
||||
|
||||
document: DoclingDocument = _EMPTY_DOCLING_DOC
|
||||
|
||||
|
@ -5,6 +5,7 @@ from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import DocItemLabel
|
||||
from docling_ibm_models.layoutmodel.layout_predictor import LayoutPredictor
|
||||
from PIL import Image
|
||||
@ -184,6 +185,14 @@ class LayoutModel(BasePageModel):
|
||||
).postprocess()
|
||||
# processed_clusters, processed_cells = clusters, page.cells
|
||||
|
||||
conv_res.confidence.pages[page.page_no].layout_score = float(
|
||||
np.mean([c.confidence for c in processed_clusters])
|
||||
)
|
||||
|
||||
conv_res.confidence.pages[page.page_no].ocr_score = float(
|
||||
np.mean([c.confidence for c in processed_cells if c.from_ocr])
|
||||
)
|
||||
|
||||
page.cells = processed_cells
|
||||
page.predictions.layout = LayoutPrediction(
|
||||
clusters=processed_clusters
|
||||
|
@ -3,6 +3,7 @@ import re
|
||||
from collections.abc import Iterable
|
||||
from typing import List
|
||||
|
||||
import numpy as np
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import (
|
||||
|
@ -1,11 +1,13 @@
|
||||
import re
|
||||
from collections.abc import Iterable
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
import numpy as np
|
||||
from PIL import ImageDraw
|
||||
from pydantic import BaseModel
|
||||
|
||||
from docling.datamodel.base_models import Page
|
||||
from docling.datamodel.base_models import Page, ScoreValue
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.settings import settings
|
||||
from docling.models.base_model import BasePageModel
|
||||
@ -21,6 +23,14 @@ class PagePreprocessingModel(BasePageModel):
|
||||
def __init__(self, options: PagePreprocessingOptions):
    """Store the preprocessing options and pre-compile garbage-text patterns."""
    self.options = options

    # Pre-compiled regex patterns for efficiency
    # Raw font-glyph references leaking into text, e.g. "GLYPH<0041>".
    self.GLYPH_RE = re.compile(r"GLYPH<[0-9A-Fa-f]+>")
    # Two or more consecutive "/G<digits>" glyph-id runs.
    self.SLASH_G_RE = re.compile(r"(?:/G\d+){2,}")
    # Fragmented-word artifact: a letter followed by 2+ "/xx.yy" chunks.
    self.FRAG_RE = re.compile(r"\b[A-Za-z](?:/[a-z]{1,3}\.[a-z]{1,3}){2,}\b")
    self.SLASH_NUMBER_GARBAGE_RE = re.compile(
        r"(?:/\w+\s*){2,}"
    )  # Two or more "/token " sequences
|
||||
|
||||
def __call__(
|
||||
self, conv_res: ConversionResult, page_batch: Iterable[Page]
|
||||
) -> Iterable[Page]:
|
||||
@ -60,6 +70,18 @@ class PagePreprocessingModel(BasePageModel):
|
||||
if self.options.create_parsed_page:
|
||||
page.parsed_page = page._backend.get_segmented_page()
|
||||
|
||||
# Rate the text quality from the PDF parser, and aggregate on page
|
||||
text_scores = []
|
||||
for c in page.cells:
|
||||
score = self.rate_text_quality(c.text)
|
||||
text_scores.append(score)
|
||||
|
||||
conv_res.confidence.pages[page.page_no].parse_score = float(
|
||||
np.nanquantile(
|
||||
text_scores, q=0.10
|
||||
) # To emphasise problems in the parse_score, we take the 10% percentile score of all text cells.
|
||||
)
|
||||
|
||||
# DEBUG code:
|
||||
def draw_text_boxes(image, cells, show: bool = False):
|
||||
draw = ImageDraw.Draw(image)
|
||||
@ -88,3 +110,30 @@ class PagePreprocessingModel(BasePageModel):
|
||||
draw_text_boxes(page.get_image(scale=1.0), page.cells)
|
||||
|
||||
return page
|
||||
|
||||
def rate_text_quality(self, text: str) -> float:
    """Rate the parser-output quality of *text* on a [0.0, 1.0] scale.

    Hard errors (known garbage patterns) return 0.0 immediately; otherwise
    soft heuristics subtract penalties from a perfect 1.0.
    """
    # Hard errors: the Unicode replacement character U+FFFD (inserted on
    # byte-decoding failures) or any pre-compiled garbage pattern.
    # Written as an escape so the intent survives file re-encoding.
    blacklist_chars = ["\ufffd"]
    if (
        any(c in text for c in blacklist_chars)
        or self.GLYPH_RE.search(text)
        or self.SLASH_G_RE.search(text)
        # match() anchors at the start: flags text that *begins* as a run
        # of "/token " sequences, i.e. is mostly slash-token garbage.
        or self.SLASH_NUMBER_GARBAGE_RE.match(text)
    ):
        return 0.0

    penalty = 0.0

    # Apply a penalty only if the fragmented-words pattern occurs at least
    # three times, so isolated false positives are not punished.
    frag_matches = self.FRAG_RE.findall(text)
    if len(frag_matches) >= 3:
        penalty += 0.1 * len(frag_matches)

    return max(1.0 - penalty, 0.0)
|
||||
|
@ -3,11 +3,12 @@ import warnings
|
||||
from pathlib import Path
|
||||
from typing import Optional, cast
|
||||
|
||||
import numpy as np
|
||||
from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
|
||||
|
||||
from docling.backend.abstract_backend import AbstractDocumentBackend
|
||||
from docling.backend.pdf_backend import PdfDocumentBackend
|
||||
from docling.datamodel.base_models import AssembledUnit, Page
|
||||
from docling.datamodel.base_models import AssembledUnit, Page, PageConfidenceScores
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import PdfPipelineOptions
|
||||
from docling.datamodel.settings import settings
|
||||
@ -60,7 +61,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
or self.pipeline_options.generate_table_images
|
||||
)
|
||||
|
||||
self.glm_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||
self.reading_order_model = ReadingOrderModel(options=ReadingOrderOptions())
|
||||
|
||||
ocr_model = self.get_ocr_model(artifacts_path=artifacts_path)
|
||||
|
||||
@ -197,7 +198,7 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
elements=all_elements, headers=all_headers, body=all_body
|
||||
)
|
||||
|
||||
conv_res.document = self.glm_model(conv_res)
|
||||
conv_res.document = self.reading_order_model(conv_res)
|
||||
|
||||
# Generate page images in the output
|
||||
if self.pipeline_options.generate_page_images:
|
||||
@ -244,6 +245,30 @@ class StandardPdfPipeline(PaginatedPipeline):
|
||||
cropped_im, dpi=int(72 * scale)
|
||||
)
|
||||
|
||||
# Aggregate confidence values for document:
|
||||
if len(conv_res.pages) > 0:
|
||||
conv_res.confidence.layout_score = float(
|
||||
np.nanmean(
|
||||
[c.layout_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
conv_res.confidence.parse_score = float(
|
||||
np.nanquantile(
|
||||
[c.parse_score for c in conv_res.confidence.pages.values()],
|
||||
q=0.1, # parse score should relate to worst 10% of pages.
|
||||
)
|
||||
)
|
||||
conv_res.confidence.table_score = float(
|
||||
np.nanmean(
|
||||
[c.table_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
conv_res.confidence.ocr_score = float(
|
||||
np.nanmean(
|
||||
[c.ocr_score for c in conv_res.confidence.pages.values()]
|
||||
)
|
||||
)
|
||||
|
||||
return conv_res
|
||||
|
||||
@classmethod
|
||||
|
@ -7,7 +7,7 @@ from docling.backend.docling_parse_backend import DoclingParseDocumentBackend
|
||||
from docling.backend.docling_parse_v2_backend import DoclingParseV2DocumentBackend
|
||||
from docling.backend.docling_parse_v4_backend import DoclingParseV4DocumentBackend
|
||||
from docling.backend.pypdfium2_backend import PyPdfiumDocumentBackend
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat
|
||||
from docling.datamodel.base_models import ConversionStatus, InputFormat, QualityGrade
|
||||
from docling.datamodel.document import ConversionResult
|
||||
from docling.datamodel.pipeline_options import (
|
||||
AcceleratorDevice,
|
||||
@ -163,3 +163,11 @@ def test_parser_backends(test_doc_path):
|
||||
doc_result: ConversionResult = converter.convert(test_doc_path)
|
||||
|
||||
assert doc_result.status == ConversionStatus.SUCCESS
|
||||
|
||||
|
||||
def test_confidence(test_doc_path):
    """Convert a short page range and check the aggregated confidence grades."""
    converter = DocumentConverter()
    result: ConversionResult = converter.convert(test_doc_path, page_range=(6, 9))

    assert result.confidence.mean_grade == QualityGrade.EXCELLENT
    assert result.confidence.low_grade == QualityGrade.EXCELLENT
|
||||
|
Loading…
x
Reference in New Issue
Block a user