diff --git a/olmocr/pipeline.py b/olmocr/pipeline.py index 8dfc709..d8408bd 100644 --- a/olmocr/pipeline.py +++ b/olmocr/pipeline.py @@ -3,7 +3,6 @@ import asyncio import atexit import base64 import datetime -import glob import hashlib import json import logging diff --git a/olmocr/prompts/_adv_anchor.py b/olmocr/prompts/_adv_anchor.py deleted file mode 100644 index 6ba701e..0000000 --- a/olmocr/prompts/_adv_anchor.py +++ /dev/null @@ -1,596 +0,0 @@ -import math -from typing import ( - Any, - Callable, - Dict, - List, - Optional, - Tuple, - Union, - cast, -) - -from pypdf._cmap import build_char_map, unknown_char_map -from pypdf.constants import PageAttributes as PG -from pypdf.generic import ( - ContentStream, - DictionaryObject, - FloatObject, - NameObject, - NumberObject, - TextStringObject, - encode_pdfdocencoding, -) - -CUSTOM_RTL_MIN: int = -1 -CUSTOM_RTL_MAX: int = -1 -CUSTOM_RTL_SPECIAL_CHARS: List[int] = [] -LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5 - - -class OrientationNotFoundError(Exception): - pass - - -def set_custom_rtl( - _min: Union[str, int, None] = None, - _max: Union[str, int, None] = None, - specials: Union[str, List[int], None] = None, -) -> Tuple[int, int, List[int]]: - """ - Change the Right-To-Left and special characters custom parameters. - - Args: - _min: The new minimum value for the range of custom characters that - will be written right to left. - If set to ``None``, the value will not be changed. - If set to an integer or string, it will be converted to its ASCII code. - The default value is -1, which sets no additional range to be converted. - _max: The new maximum value for the range of custom characters that will - be written right to left. - If set to ``None``, the value will not be changed. - If set to an integer or string, it will be converted to its ASCII code. - The default value is -1, which sets no additional range to be converted. - specials: The new list of special characters to be inserted in the - current insertion order. - If set to ``None``, the current value will not be changed. - If set to a string, it will be converted to a list of ASCII codes. - The default value is an empty list. - - Returns: - A tuple containing the new values for ``CUSTOM_RTL_MIN``, - ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``. - """ - global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS - if isinstance(_min, int): - CUSTOM_RTL_MIN = _min - elif isinstance(_min, str): - CUSTOM_RTL_MIN = ord(_min) - if isinstance(_max, int): - CUSTOM_RTL_MAX = _max - elif isinstance(_max, str): - CUSTOM_RTL_MAX = ord(_max) - if isinstance(specials, str): - CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials] - elif isinstance(specials, list): - CUSTOM_RTL_SPECIAL_CHARS = specials - return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS - - -def mult(m: List[float], n: List[float]) -> List[float]: - return [ - m[0] * n[0] + m[1] * n[2], - m[0] * n[1] + m[1] * n[3], - m[2] * n[0] + m[3] * n[2], - m[2] * n[1] + m[3] * n[3], - m[4] * n[0] + m[5] * n[2] + n[4], - m[4] * n[1] + m[5] * n[3] + n[5], - ] - - -def orient(m: List[float]) -> int: - if m[3] > 1e-6: - return 0 - elif m[3] < -1e-6: - return 180 - elif m[1] > 0: - return 90 - else: - return 270 - - -def crlf_space_check( - text: str, - cmtm_prev: Tuple[List[float], List[float]], - cmtm_matrix: Tuple[List[float], List[float]], - memo_cmtm: Tuple[List[float], List[float]], - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], - orientations: Tuple[int, ...], - output: str, - font_size: float, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], - spacewidth: float, -) -> Tuple[str, str, List[float], List[float]]: - cm_prev = cmtm_prev[0] - tm_prev = cmtm_prev[1] - cm_matrix = cmtm_matrix[0] - tm_matrix = cmtm_matrix[1] - memo_cm = memo_cmtm[0] - memo_tm = memo_cmtm[1] - - m_prev = mult(tm_prev, cm_prev) - m = mult(tm_matrix, cm_matrix) - orientation = orient(m) - delta_x = m[4] - m_prev[4] - delta_y = m[5] - m_prev[5] - k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2])) - f = font_size * k - cm_prev = m - if orientation not in orientations: - raise OrientationNotFoundError - try: - if orientation == 0: - if delta_y < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif abs(delta_y) < f * 0.3 and abs(delta_x) > spacewidth * f * 15 and (output + text)[-1] != " ": - text += " " - elif orientation == 180: - if delta_y > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif abs(delta_y) < f * 0.3 and abs(delta_x) > spacewidth * f * 15 and (output + text)[-1] != " ": - text += " " - elif orientation == 90: - if delta_x > 0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif abs(delta_x) < f * 0.3 and abs(delta_y) > spacewidth * f * 15 and (output + text)[-1] != " ": - text += " " - elif orientation == 270: - if delta_x < -0.8 * f: - if (output + text)[-1] != "\n": - output += text + "\n" - if visitor_text is not None: - visitor_text( - text + "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - text = "" - elif abs(delta_x) < f * 0.3 and abs(delta_y) > spacewidth * f * 15 and (output + text)[-1] != " ": - text += " " - except Exception: - pass - tm_prev = tm_matrix.copy() - cm_prev = cm_matrix.copy() - return text, output, cm_prev, tm_prev - - -def handle_tj( - text: str, - operands: List[Union[str, TextStringObject]], - cm_matrix: List[float], - tm_matrix: List[float], - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]], - orientations: Tuple[int, ...], - output: str, - font_size: float, - rtl_dir: bool, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]], -) -> Tuple[str, bool]: - m = mult(tm_matrix, cm_matrix) - orientation = orient(m) - if orientation in orientations and len(operands) > 0: - if isinstance(operands[0], str): - text += operands[0] - else: - t: str = "" - tt: bytes = encode_pdfdocencoding(operands[0]) if isinstance(operands[0], str) else operands[0] - if isinstance(cmap[0], str): - try: - t = tt.decode(cmap[0], "surrogatepass") # apply str encoding - except Exception: - # the data does not match the expectation, - # we use the alternative ; - # text extraction may not be good - t = tt.decode( - "utf-16-be" if cmap[0] == "charmap" else "charmap", - "surrogatepass", - ) # apply str encoding - else: # apply dict encoding - t = "".join([cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]) - # "\u0590 - \u08FF \uFB50 - \uFDFF" - for x in [cmap[1][x] if x in cmap[1] else x for x in t]: - # x can be a sequence of bytes ; ex: habibi.pdf - if len(x) == 1: - xx = ord(x) - else: - xx = 1 - # fmt: off - if ( - # cases where the current inserting order is kept - (xx <= 0x2F) # punctuations but... - or 0x3A <= xx <= 0x40 # numbers (x30-39) - or 0x2000 <= xx <= 0x206F # upper punctuations.. - or 0x20A0 <= xx <= 0x21FF # but (numbers) indices/exponents - or xx in CUSTOM_RTL_SPECIAL_CHARS # customized.... - ): - text = x + text if rtl_dir else text + x - elif ( # right-to-left characters set - 0x0590 <= xx <= 0x08FF - or 0xFB1D <= xx <= 0xFDFF - or 0xFE70 <= xx <= 0xFEFF - or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX - ): - if not rtl_dir: - rtl_dir = True - output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) - text = "" - text = x + text - else: # left-to-right - # print(">",xx,x,end="") - if rtl_dir: - rtl_dir = False - output += text - if visitor_text is not None: - visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size) - text = "" - text = text + x - # fmt: on - return text, rtl_dir - - -def extract_page( - obj: Any, - pdf: Any, - orientations: Tuple[int, ...] = (0, 90, 180, 270), - space_width: float = 200.0, - content_key: Optional[str] = PG.CONTENTS, - visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None, - visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None, - visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None, -) -> str: - """ - See extract_text for most arguments. - - Args: - content_key: indicate the default key where to extract data - None = the object; this allow to reuse the function on XObject - default = "/Content" - """ - text: str = "" - output: str = "" - rtl_dir: bool = False # right-to-left - cmaps: Dict[ - str, - Tuple[str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject], - ] = {} - try: - objr = obj - while NameObject(PG.RESOURCES) not in objr: - # /Resources can be inherited sometimes so we look to parents - objr = objr["/Parent"].get_object() - # if no parents we will have no /Resources will be available - # => an exception will be raised - resources_dict = cast(DictionaryObject, objr[PG.RESOURCES]) - except Exception: - # no resources means no text is possible (no font) we consider the - # file as not damaged, no need to check for TJ or Tj - return "" - if "/Font" in resources_dict: - for f in cast(DictionaryObject, resources_dict["/Font"]): - cmaps[f] = build_char_map(f, space_width, obj) - cmap: Tuple[Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]] = ( - "charmap", - {}, - "NotInitialized", - None, - ) # (encoding,CMAP,font resource name,dictionary-object of font) - try: - content = obj[content_key].get_object() if isinstance(content_key, str) else obj - if not isinstance(content, ContentStream): - content = ContentStream(content, pdf, "bytes") - except KeyError: # it means no content can be extracted(certainly empty page) - return "" - # Note: we check all strings are TextStringObjects. ByteStringObjects - # are strings where the byte->string encoding was unknown, so adding - # them to the text here would be gibberish. - - cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - cm_stack = [] - tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - - # cm/tm_prev stores the last modified matrices can be an intermediate position - cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - - # memo_cm/tm will be used to store the position at the beginning of building the text - memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - char_scale = 1.0 - space_scale = 1.0 - _space_width: float = 500.0 # will be set correctly at first Tf - TL = 0.0 - font_size = 12.0 # init just in case of - - def current_spacewidth() -> float: - return _space_width / 1000.0 - - def process_operation(operator: bytes, operands: List[Any]) -> None: - nonlocal cm_matrix, cm_stack, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm - nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap - nonlocal orientations, rtl_dir, visitor_text, output, text - global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS - - check_crlf_space: bool = False - # Table 5.4 page 405 - if operator == b"BT": - tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - return None - elif operator == b"ET": - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - # table 4.7 "Graphics state operators", page 219 - # cm_matrix calculation is a reserved for the moment - elif operator == b"q": - cm_stack.append( - ( - cm_matrix, - cmap, - font_size, - char_scale, - space_scale, - _space_width, - TL, - ) - ) - elif operator == b"Q": - try: - ( - cm_matrix, - cmap, - font_size, - char_scale, - space_scale, - _space_width, - TL, - ) = cm_stack.pop() - except Exception: - cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0] - elif operator == b"cm": - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - cm_matrix = mult( - [ - float(operands[0]), - float(operands[1]), - float(operands[2]), - float(operands[3]), - float(operands[4]), - float(operands[5]), - ], - cm_matrix, - ) - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - # Table 5.2 page 398 - elif operator == b"Tz": - char_scale = float(operands[0]) / 100.0 - elif operator == b"Tw": - space_scale = 1.0 + float(operands[0]) - elif operator == b"TL": - TL = float(operands[0]) - elif operator == b"Tf": - if text != "": - output += text # .translate(cmap) - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - try: - # charMapTuple: font_type, float(sp_width / 2), encoding, - # map_dict, font-dictionary - charMapTuple = cmaps[operands[0]] - _space_width = charMapTuple[1] - # current cmap: encoding, map_dict, font resource name - # (internal name, not the real font-name), - # font-dictionary. The font-dictionary describes the font. - cmap = ( - charMapTuple[2], - charMapTuple[3], - operands[0], - charMapTuple[4], - ) - except KeyError: # font not found - _space_width = unknown_char_map[1] - cmap = ( - unknown_char_map[2], - unknown_char_map[3], - "???" + operands[0], - None, - ) - try: - font_size = float(operands[1]) - except Exception: - pass # keep previous size - # Table 5.5 page 406 - elif operator == b"Td": - check_crlf_space = True - # A special case is a translating only tm: - # tm[0..5] = 1 0 0 1 e f, - # i.e. tm[4] += tx, tm[5] += ty. - tx = float(operands[0]) - ty = float(operands[1]) - tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2] - tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3] - elif operator == b"Tm": - check_crlf_space = True - tm_matrix = [ - float(operands[0]), - float(operands[1]), - float(operands[2]), - float(operands[3]), - float(operands[4]), - float(operands[5]), - ] - elif operator == b"T*": - check_crlf_space = True - tm_matrix[5] -= TL - - elif operator == b"Tj": - check_crlf_space = True - text, rtl_dir = handle_tj( - text, - operands, - cm_matrix, - tm_matrix, # text matrix - cmap, - orientations, - output, - font_size, - rtl_dir, - visitor_text, - ) - else: - return None - if check_crlf_space: - try: - text, output, cm_prev, tm_prev = crlf_space_check( - text, - (cm_prev, tm_prev), - (cm_matrix, tm_matrix), - (memo_cm, memo_tm), - cmap, - orientations, - output, - font_size, - visitor_text, - current_spacewidth(), - ) - if text == "": - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - except OrientationNotFoundError: - return None - - for operands, operator in content.operations: - if visitor_operand_before is not None: - visitor_operand_before(operator, operands, cm_matrix, tm_matrix) - # multiple operators are defined in here #### - if operator == b"'": - process_operation(b"T*", []) - process_operation(b"Tj", operands) - elif operator == b'"': - process_operation(b"Tw", [operands[0]]) - process_operation(b"Tc", [operands[1]]) - process_operation(b"T*", []) - process_operation(b"Tj", operands[2:]) - elif operator == b"TD": - process_operation(b"TL", [-operands[1]]) - process_operation(b"Td", operands) - elif operator == b"TJ": - for op in operands[0]: - if isinstance(op, (str, bytes)): - process_operation(b"Tj", [op]) - if isinstance(op, (int, float, NumberObject, FloatObject)) and ((abs(float(op)) >= _space_width) and (len(text) > 0) and (text[-1] != " ")): - process_operation(b"Tj", [" "]) - elif operator == b"Do": - output += text - if visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - try: - if output[-1] != "\n": - output += "\n" - if visitor_text is not None: - visitor_text( - "\n", - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - except IndexError: - pass - try: - xobj = resources_dict["/XObject"] - if xobj[operands[0]]["/Subtype"] != "/Image": # type: ignore - text = self.extract_xform_text( - xobj[operands[0]], # type: ignore - orientations, - space_width, - visitor_operand_before, - visitor_operand_after, - visitor_text, - ) - output += text - if visitor_text is not None: - visitor_text( - text, - memo_cm, - memo_tm, - cmap[3], - font_size, - ) - except Exception: - print( - f" impossible to decode XFormObject {operands[0]}", - __name__, - ) - finally: - text = "" - memo_cm = cm_matrix.copy() - memo_tm = tm_matrix.copy() - - else: - process_operation(operator, operands) - if visitor_operand_after is not None: - visitor_operand_after(operator, operands, cm_matrix, tm_matrix) - output += text # just in case of - if text != "" and visitor_text is not None: - visitor_text(text, memo_cm, memo_tm, cmap[3], font_size) - return output diff --git a/olmocr/prompts/anchor.py b/olmocr/prompts/anchor.py index 2c4d04d..b30413e 100644 --- a/olmocr/prompts/anchor.py +++ b/olmocr/prompts/anchor.py @@ -22,7 +22,6 @@ from pypdf import PdfReader from pypdf.generic import RectangleObject from olmocr.filter.coherency import get_document_coherency -from olmocr.prompts._adv_anchor import mult def get_anchor_text( @@ -95,6 +94,17 @@ def _transform_point(x, y, m): return x_new, y_new +def _mult(m: List[float], n: List[float]) -> List[float]: + return [ + m[0] * n[0] + m[1] * n[2], + m[0] * n[1] + m[1] * n[3], + m[2] * n[0] + m[3] * n[2], + m[2] * n[1] + m[3] * n[3], + m[4] * n[0] + m[5] * n[2] + n[4], + m[4] * n[1] + m[5] * n[3] + n[5], + ] + + @dataclass(frozen=True) class Element: pass @@ -140,7 +150,7 @@ def _pdf_report(local_pdf_path: str, page_num: int) -> PageReport: text_elements, image_elements = [], [] def visitor_body(text, cm, tm, font_dict, font_size): - txt2user = mult(tm, cm) + txt2user = _mult(tm, cm) text_elements.append(TextElement(text, txt2user[4], txt2user[5])) def visitor_op(op, args, cm, tm): diff --git a/scripts/benchmark_throughput.py b/scripts/benchmark_throughput.py index c410e96..8c11ecf 100644 --- a/scripts/benchmark_throughput.py +++ b/scripts/benchmark_throughput.py @@ -89,12 +89,12 @@ def sample_mm_requests_qwen2vl( text = processor.apply_chat_template(data["chat_messages"], tokenize=False, add_generation_prompt=True) raw_b64 = data["chat_messages"][0]["content"][1]["image_url"]["url"] - main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :]))) + _main_image = Image.open(BytesIO(base64.b64decode(raw_b64[raw_b64.find(",") + 1 :]))) # Process inputs using processor inputs = processor( text=[text], - # images=[main_image], # Don't pad out the image tokens yet, since that happens later inside of birr + # images=[_main_image], # Don't pad out the image tokens yet, since that happens later inside of birr padding=True, return_tensors="np", ) diff --git a/scripts/movedolmadocs_to_md.py b/scripts/movedolmadocs_to_md.py index c144429..352e5b5 100644 --- a/scripts/movedolmadocs_to_md.py +++ b/scripts/movedolmadocs_to_md.py @@ -1,6 +1,5 @@ #!/usr/bin/env python3 import argparse -import io import json import os from urllib.parse import urlparse