import logging
import math
from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Sequence,
    Set,
    Tuple,
    Union,
    cast,
    overload,
)

from pypdf._cmap import build_char_map, unknown_char_map
from pypdf.constants import AnnotationDictionaryAttributes as ADA
from pypdf.constants import ImageAttributes as IA
from pypdf.constants import PageAttributes as PG
from pypdf.constants import Resources as RES
from pypdf.generic import (
    ContentStream,
    DictionaryObject,
    FloatObject,
    NameObject,
    NumberObject,
    TextStringObject,
    encode_pdfdocencoding,
)

CUSTOM_RTL_MIN: int = -1
CUSTOM_RTL_MAX: int = -1
CUSTOM_RTL_SPECIAL_CHARS: List[int] = []
LAYOUT_NEW_BT_GROUP_SPACE_WIDTHS: int = 5


class OrientationNotFoundError(Exception):
    """Raised when the current text orientation is not one of the requested ones."""


def set_custom_rtl(
    _min: Union[str, int, None] = None,
    _max: Union[str, int, None] = None,
    specials: Union[str, List[int], None] = None,
) -> Tuple[int, int, List[int]]:
    """
    Change the Right-To-Left and special characters custom parameters.

    Args:
        _min: The new minimum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is stored as is; a string is converted with ``ord``
            (so it must be a single character).
            The default value is -1, which sets no additional range to be
            converted.
        _max: The new maximum value for the range of custom characters that
            will be written right to left.
            If set to ``None``, the value will not be changed.
            An integer is stored as is; a string is converted with ``ord``
            (so it must be a single character).
            The default value is -1, which sets no additional range to be
            converted.
        specials: The new list of special characters to be inserted in the
            current insertion order.
            If set to ``None``, the current value will not be changed.
            If set to a string, it will be converted to a list of character
            codes. A list is stored as is (not copied).
            The default value is an empty list.

    Returns:
        A tuple containing the new values for ``CUSTOM_RTL_MIN``,
        ``CUSTOM_RTL_MAX``, and ``CUSTOM_RTL_SPECIAL_CHARS``.
    """
    global CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS
    if isinstance(_min, int):
        CUSTOM_RTL_MIN = _min
    elif isinstance(_min, str):
        CUSTOM_RTL_MIN = ord(_min)
    if isinstance(_max, int):
        CUSTOM_RTL_MAX = _max
    elif isinstance(_max, str):
        CUSTOM_RTL_MAX = ord(_max)
    if isinstance(specials, str):
        CUSTOM_RTL_SPECIAL_CHARS = [ord(x) for x in specials]
    elif isinstance(specials, list):
        CUSTOM_RTL_SPECIAL_CHARS = specials
    return CUSTOM_RTL_MIN, CUSTOM_RTL_MAX, CUSTOM_RTL_SPECIAL_CHARS


def mult(m: List[float], n: List[float]) -> List[float]:
    """
    Multiply two six-element transformation matrices.

    Each matrix is given as ``[a, b, c, d, e, f]``; the result is ``m`` applied
    first, followed by ``n``, in the same six-element form.
    """
    return [
        m[0] * n[0] + m[1] * n[2],
        m[0] * n[1] + m[1] * n[3],
        m[2] * n[0] + m[3] * n[2],
        m[2] * n[1] + m[3] * n[3],
        m[4] * n[0] + m[5] * n[2] + n[4],
        m[4] * n[1] + m[5] * n[3] + n[5],
    ]


def orient(m: List[float]) -> int:
    """
    Classify a transformation matrix into one of four orientations.

    Returns:
        0 when ``m[3]`` is clearly positive, 180 when it is clearly negative;
        otherwise 90 when ``m[1]`` is positive and 270 in every other case.
    """
    if m[3] > 1e-6:
        return 0
    elif m[3] < -1e-6:
        return 180
    elif m[1] > 0:
        return 90
    else:
        return 270


def crlf_space_check(
    text: str,
    cmtm_prev: Tuple[List[float], List[float]],
    cmtm_matrix: Tuple[List[float], List[float]],
    memo_cmtm: Tuple[List[float], List[float]],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
    spacewidth: float,
) -> Tuple[str, str, List[float], List[float]]:
    """
    Insert a line break or a space when the text position jumped.

    The previous and the current positions are compared (both obtained by
    combining the text matrix with the cm matrix). A large move against the
    line direction flushes ``text`` into ``output`` followed by a newline;
    a large move along the line on (almost) the same baseline appends a space
    to ``text``.

    Args:
        text: Text collected so far and not yet flushed.
        cmtm_prev: ``(cm, tm)`` matrices of the previous position.
        cmtm_matrix: ``(cm, tm)`` matrices of the current position.
        memo_cmtm: ``(cm, tm)`` matrices passed to ``visitor_text`` on flush.
        cmap: Current character-map tuple; only ``cmap[3]`` is forwarded to
            ``visitor_text``.
        orientations: Orientations (0, 90, 180, 270) that are to be processed.
        output: Text already flushed.
        font_size: Current font size.
        visitor_text: Optional callback invoked with the flushed text.
        spacewidth: Space width factor used for the space threshold.

    Returns:
        ``(text, output, cm_prev, tm_prev)`` where the two matrices are copies
        of the current cm and tm matrices.

    Raises:
        OrientationNotFoundError: The current orientation is not part of
            ``orientations``.
    """
    cm_prev = cmtm_prev[0]
    tm_prev = cmtm_prev[1]
    cm_matrix = cmtm_matrix[0]
    tm_matrix = cmtm_matrix[1]
    memo_cm = memo_cmtm[0]
    memo_tm = memo_cmtm[1]

    m_prev = mult(tm_prev, cm_prev)
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    delta_x = m[4] - m_prev[4]
    delta_y = m[5] - m_prev[5]
    # scale factor of the current matrix, applied to the font size
    k = math.sqrt(abs(m[0] * m[3]) + abs(m[1] * m[2]))
    f = font_size * k
    if orientation not in orientations:
        raise OrientationNotFoundError

    # ``across`` is the move perpendicular to the writing direction,
    # ``along`` the move in the writing direction.
    if orientation == 0:
        new_line = delta_y < -0.8 * f
        across, along = delta_y, delta_x
    elif orientation == 180:
        new_line = delta_y > 0.8 * f
        across, along = delta_y, delta_x
    elif orientation == 90:
        new_line = delta_x > 0.8 * f
        across, along = delta_x, delta_y
    else:  # 270 is the only remaining value orient() can return
        new_line = delta_x < -0.8 * f
        across, along = delta_x, delta_y

    try:
        if new_line:
            if (output + text)[-1] != "\n":
                output += text + "\n"
                if visitor_text is not None:
                    visitor_text(
                        text + "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
                text = ""
        elif (
            abs(across) < f * 0.3
            and abs(along) > spacewidth * f * 15
            and (output + text)[-1] != " "
        ):
            text += " "
    except Exception:
        # Best effort: e.g. ``(output + text)[-1]`` raises IndexError while
        # nothing has been collected yet; in that case nothing is inserted.
        pass

    return text, output, cm_matrix.copy(), tm_matrix.copy()


def handle_tj(
    text: str,
    operands: List[Union[str, TextStringObject]],
    cm_matrix: List[float],
    tm_matrix: List[float],
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ],
    orientations: Tuple[int, ...],
    output: str,
    font_size: float,
    rtl_dir: bool,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]],
) -> Tuple[str, bool]:
    """
    Append the string shown by a ``Tj`` operation to the collected text.

    ``str`` operands are appended unchanged. Other operands are treated as
    bytes, decoded with ``cmap[0]`` (a codec name or a ``{byte: str}``
    mapping), translated through ``cmap[1]`` and then inserted character by
    character, honouring the writing direction.

    Args:
        text: Text collected so far and not yet flushed.
        operands: Operands of the operation; only ``operands[0]`` is used.
        cm_matrix: Current cm matrix.
        tm_matrix: Current text matrix.
        cmap: ``(encoding, map_dict, font resource name, font dictionary)``.
        orientations: Orientations (0, 90, 180, 270) that are to be processed.
        output: Text already flushed (see the note in the body).
        font_size: Current font size.
        rtl_dir: ``True`` while text is being written right to left.
        visitor_text: Optional callback invoked when ``text`` is flushed
            because the writing direction changed.

    Returns:
        The updated ``(text, rtl_dir)``.
    """
    m = mult(tm_matrix, cm_matrix)
    orientation = orient(m)
    if orientation not in orientations or len(operands) == 0:
        return text, rtl_dir

    if isinstance(operands[0], str):
        text += operands[0]
        return text, rtl_dir

    t: str = ""
    # operands[0] is not a str in this branch, so no re-encoding is needed
    tt: bytes = cast(bytes, operands[0])
    if isinstance(cmap[0], str):
        try:
            t = tt.decode(cmap[0], "surrogatepass")  # apply str encoding
        except Exception:
            # the data does not match the expectation,
            # we use the alternative ;
            # text extraction may not be good
            t = tt.decode(
                "utf-16-be" if cmap[0] == "charmap" else "charmap",
                "surrogatepass",
            )  # apply str encoding
    else:  # apply dict encoding
        t = "".join(
            [cmap[0][x] if x in cmap[0] else bytes((x,)).decode() for x in tt]
        )

    # "\u0590 - \u08FF \uFB50 - \uFDFF"
    for ch in t:
        x = cmap[1].get(ch, ch)
        # x can be a sequence of bytes ; ex: habibi.pdf
        xx = ord(x) if len(x) == 1 else 1
        # fmt: off
        if (
            # cases where the current inserting order is kept
            (xx <= 0x2F)                        # punctuations but...
            or 0x3A <= xx <= 0x40               # numbers (x30-39)
            or 0x2000 <= xx <= 0x206F           # upper punctuations..
            or 0x20A0 <= xx <= 0x21FF           # but (numbers) indices/exponents
            or xx in CUSTOM_RTL_SPECIAL_CHARS   # customized....
        ):
            text = x + text if rtl_dir else text + x
        elif (  # right-to-left characters set
            0x0590 <= xx <= 0x08FF
            or 0xFB1D <= xx <= 0xFDFF
            or 0xFE70 <= xx <= 0xFEFF
            or CUSTOM_RTL_MIN <= xx <= CUSTOM_RTL_MAX
        ):
            if not rtl_dir:
                rtl_dir = True
                # NOTE(review): ``output`` is a local string and is not part
                # of the return value, so the text flushed here only reaches
                # ``visitor_text``; the caller's output does not receive it.
                output += text
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = x + text
        else:  # left-to-right
            if rtl_dir:
                rtl_dir = False
                # NOTE(review): same as above, this append is not returned.
                output += text
                if visitor_text is not None:
                    visitor_text(text, cm_matrix, tm_matrix, cmap[3], font_size)
                text = ""
            text = text + x
        # fmt: on
    return text, rtl_dir


def extract_page(
    obj: Any,
    pdf: Any,
    orientations: Tuple[int, ...] = (0, 90, 180, 270),
    space_width: float = 200.0,
    content_key: Optional[str] = PG.CONTENTS,
    visitor_operand_before: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_operand_after: Optional[Callable[[Any, Any, Any, Any], None]] = None,
    visitor_text: Optional[Callable[[Any, Any, Any, Any, Any], None]] = None,
) -> str:
    """
    Extract the text of a page (or of a form XObject) from its content stream.

    See extract_text for most arguments.

    Args:
        obj: The object to extract from; its resources are looked up on the
            object itself or, if missing, on its ``/Parent`` chain.
        pdf: Passed to ``ContentStream`` when the content has to be parsed.
        orientations: Orientations (0, 90, 180, 270) of the text to keep.
        space_width: Passed to ``build_char_map`` for every font.
        content_key: indicate the default key where to extract data
            None = the object; this allows to reuse the function on XObject
            default = ``PG.CONTENTS``
        visitor_operand_before: Called before each operation with
            ``(operator, operands, cm_matrix, tm_matrix)``.
        visitor_operand_after: Called after each operation with
            ``(operator, operands, cm_matrix, tm_matrix)``.
        visitor_text: Called whenever a piece of text is flushed with
            ``(text, cm_matrix, tm_matrix, font_dictionary, font_size)``.

    Returns:
        The extracted text; an empty string when no resources or no content
        can be found.
    """
    text: str = ""
    output: str = ""
    rtl_dir: bool = False  # right-to-left
    cmaps: Dict[
        str,
        Tuple[
            str, float, Union[str, Dict[int, str]], Dict[str, str], DictionaryObject
        ],
    ] = {}
    try:
        objr = obj
        while NameObject(PG.RESOURCES) not in objr:
            # /Resources can be inherited sometimes so we look to parents
            objr = objr["/Parent"].get_object()
            # without a parent no /Resources will be available
            # => an exception will be raised
        resources_dict = cast(DictionaryObject, objr[PG.RESOURCES])
    except Exception:
        # no resources means no text is possible (no font) we consider the
        # file as not damaged, no need to check for TJ or Tj
        return ""
    if "/Font" in resources_dict:
        for f in cast(DictionaryObject, resources_dict["/Font"]):
            cmaps[f] = build_char_map(f, space_width, obj)
    cmap: Tuple[
        Union[str, Dict[int, str]], Dict[str, str], str, Optional[DictionaryObject]
    ] = (
        "charmap",
        {},
        "NotInitialized",
        None,
    )  # (encoding,CMAP,font resource name,dictionary-object of font)
    try:
        content = (
            obj[content_key].get_object() if isinstance(content_key, str) else obj
        )
        if not isinstance(content, ContentStream):
            content = ContentStream(content, pdf, "bytes")
    except KeyError:  # it means no content can be extracted(certainly empty page)
        return ""
    # Note: we check all strings are TextStringObjects.  ByteStringObjects
    # are strings where the byte->string encoding was unknown, so adding
    # them to the text here would be gibberish.

    cm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    cm_stack: List[Any] = []
    tm_matrix: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # cm/tm_prev store the last modified matrices; can be an intermediate position
    cm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    tm_prev: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]

    # memo_cm/tm store the position at the beginning of building the text
    memo_cm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    memo_tm: List[float] = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
    char_scale = 1.0
    space_scale = 1.0
    _space_width: float = 500.0  # will be set correctly at first Tf
    TL = 0.0
    font_size = 12.0  # init just in case of

    def current_spacewidth() -> float:
        """Return the current space width divided by 1000."""
        return _space_width / 1000.0

    def flush_text() -> None:
        """Move the pending text to the output and notify the text visitor."""
        nonlocal output, text
        output += text
        if visitor_text is not None:
            visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
        text = ""

    def process_operation(operator: bytes, operands: List[Any]) -> None:
        """Update the extraction state for a single content-stream operation."""
        nonlocal cm_matrix, tm_matrix, cm_prev, tm_prev, memo_cm, memo_tm
        nonlocal char_scale, space_scale, _space_width, TL, font_size, cmap
        nonlocal rtl_dir, output, text

        check_crlf_space: bool = False
        # Table 5.4 page 405
        if operator == b"BT":
            tm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
            flush_text()
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
            return None
        elif operator == b"ET":
            flush_text()
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
        # table 4.7 "Graphics state operators", page 219
        # cm_matrix calculation is a reserved for the moment
        elif operator == b"q":
            cm_stack.append(
                (
                    cm_matrix,
                    cmap,
                    font_size,
                    char_scale,
                    space_scale,
                    _space_width,
                    TL,
                )
            )
        elif operator == b"Q":
            try:
                (
                    cm_matrix,
                    cmap,
                    font_size,
                    char_scale,
                    space_scale,
                    _space_width,
                    TL,
                ) = cm_stack.pop()
            except IndexError:
                # unbalanced Q: nothing to restore, fall back to identity
                cm_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]
        elif operator == b"cm":
            flush_text()
            cm_matrix = mult(
                [
                    float(operands[0]),
                    float(operands[1]),
                    float(operands[2]),
                    float(operands[3]),
                    float(operands[4]),
                    float(operands[5]),
                ],
                cm_matrix,
            )
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
        # Table 5.2 page 398
        elif operator == b"Tz":
            char_scale = float(operands[0]) / 100.0
        elif operator == b"Tw":
            space_scale = 1.0 + float(operands[0])
        elif operator == b"TL":
            TL = float(operands[0])
        elif operator == b"Tf":
            if text != "":
                output += text  # .translate(cmap)
                if visitor_text is not None:
                    visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            text = ""
            memo_cm = cm_matrix.copy()
            memo_tm = tm_matrix.copy()
            try:
                # charMapTuple: font_type, float(sp_width / 2), encoding,
                #               map_dict, font-dictionary
                charMapTuple = cmaps[operands[0]]
                _space_width = charMapTuple[1]
                # current cmap: encoding, map_dict, font resource name
                #               (internal name, not the real font-name),
                # font-dictionary. The font-dictionary describes the font.
                cmap = (
                    charMapTuple[2],
                    charMapTuple[3],
                    operands[0],
                    charMapTuple[4],
                )
            except KeyError:  # font not found
                _space_width = unknown_char_map[1]
                cmap = (
                    unknown_char_map[2],
                    unknown_char_map[3],
                    "???" + operands[0],
                    None,
                )
            try:
                font_size = float(operands[1])
            except Exception:
                pass  # deliberate best effort: keep previous size
        # Table 5.5 page 406
        elif operator == b"Td":
            check_crlf_space = True
            # A special case is a translating only tm:
            # tm[0..5] = 1 0 0 1 e f,
            # i.e. tm[4] += tx, tm[5] += ty.
            tx = float(operands[0])
            ty = float(operands[1])
            tm_matrix[4] += tx * tm_matrix[0] + ty * tm_matrix[2]
            tm_matrix[5] += tx * tm_matrix[1] + ty * tm_matrix[3]
        elif operator == b"Tm":
            check_crlf_space = True
            tm_matrix = [
                float(operands[0]),
                float(operands[1]),
                float(operands[2]),
                float(operands[3]),
                float(operands[4]),
                float(operands[5]),
            ]
        elif operator == b"T*":
            check_crlf_space = True
            tm_matrix[5] -= TL
        elif operator == b"Tj":
            check_crlf_space = True
            text, rtl_dir = handle_tj(
                text,
                operands,
                cm_matrix,
                tm_matrix,  # text matrix
                cmap,
                orientations,
                output,
                font_size,
                rtl_dir,
                visitor_text,
            )
        else:
            return None

        if check_crlf_space:
            try:
                text, output, cm_prev, tm_prev = crlf_space_check(
                    text,
                    (cm_prev, tm_prev),
                    (cm_matrix, tm_matrix),
                    (memo_cm, memo_tm),
                    cmap,
                    orientations,
                    output,
                    font_size,
                    visitor_text,
                    current_spacewidth(),
                )
                if text == "":
                    memo_cm = cm_matrix.copy()
                    memo_tm = tm_matrix.copy()
            except OrientationNotFoundError:
                return None

    for operands, operator in content.operations:
        if visitor_operand_before is not None:
            visitor_operand_before(operator, operands, cm_matrix, tm_matrix)
        # multiple operators are defined in here ####
        if operator == b"'":
            process_operation(b"T*", [])
            process_operation(b"Tj", operands)
        elif operator == b'"':
            process_operation(b"Tw", [operands[0]])
            process_operation(b"Tc", [operands[1]])
            process_operation(b"T*", [])
            process_operation(b"Tj", operands[2:])
        elif operator == b"TD":
            process_operation(b"TL", [-operands[1]])
            process_operation(b"Td", operands)
        elif operator == b"TJ":
            for op in operands[0]:
                if isinstance(op, (str, bytes)):
                    process_operation(b"Tj", [op])
                if isinstance(op, (int, float, NumberObject, FloatObject)) and (
                    (abs(float(op)) >= _space_width)
                    and (len(text) > 0)
                    and (text[-1] != " ")
                ):
                    process_operation(b"Tj", [" "])
        elif operator == b"Do":
            output += text
            if visitor_text is not None:
                visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
            # separate the XObject from what precedes it, unless nothing
            # has been output yet
            if output and output[-1] != "\n":
                output += "\n"
                if visitor_text is not None:
                    visitor_text(
                        "\n",
                        memo_cm,
                        memo_tm,
                        cmap[3],
                        font_size,
                    )
            try:
                xobj = resources_dict["/XObject"]
                if xobj[operands[0]]["/Subtype"] != "/Image":  # type: ignore
                    # Recurse into the form XObject; content_key=None makes
                    # the XObject itself the content to be parsed.
                    text = extract_page(
                        xobj[operands[0]],  # type: ignore
                        pdf,
                        orientations,
                        space_width,
                        None,
                        visitor_operand_before,
                        visitor_operand_after,
                        visitor_text,
                    )
                    output += text
                    if visitor_text is not None:
                        visitor_text(
                            text,
                            memo_cm,
                            memo_tm,
                            cmap[3],
                            font_size,
                        )
            except Exception:
                logging.getLogger(__name__).warning(
                    f" impossible to decode XFormObject {operands[0]}"
                )
            finally:
                text = ""
                memo_cm = cm_matrix.copy()
                memo_tm = tm_matrix.copy()
        else:
            process_operation(operator, operands)
        if visitor_operand_after is not None:
            visitor_operand_after(operator, operands, cm_matrix, tm_matrix)
    output += text  # just in case of
    if text != "" and visitor_text is not None:
        visitor_text(text, memo_cm, memo_tm, cmap[3], font_size)
    return output