ragflow/deepdoc/parser/pdf_parser.py

# -*- coding: utf-8 -*-
import random

import fitz
import xgboost as xgb
from io import BytesIO
import torch
import re
import pdfplumber
import logging
from PIL import Image, ImageDraw
import numpy as np

from api.db import ParserType
from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
from rag.nlp import huqie
from copy import deepcopy
from huggingface_hub import hf_hub_download


logging.getLogger("pdfminer").setLevel(logging.WARNING)


class HuParser:
    def __init__(self):
        """Instantiate the OCR, layout, table-structure and text-concat models.

        If you have trouble downloading HuggingFace models, this might help:

        For Linux:
            export HF_ENDPOINT=https://hf-mirror.com

        No equivalent hint is recorded here for Windows.
        """
        self.ocr = OCR()

        # A ``model_speciess`` attribute, when already present on the instance,
        # selects a specialised layout model (presumably set by subclasses
        # before they call this constructor -- confirm against callers).
        layout_name = "layout"
        if hasattr(self, "model_speciess"):
            layout_name = "layout." + self.model_speciess
        self.layouter = LayoutRecognizer(layout_name)
        self.tbl_det = TableStructureRecognizer()

        # XGBoost model deciding whether two vertically adjacent boxes
        # should be concatenated (see ``_concat_downward``).
        self.updown_cnt_mdl = xgb.Booster()
        if torch.cuda.is_available():
            self.updown_cnt_mdl.set_param({"device": "cuda"})
        model_path = hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
                                     filename="updown_concat_xgb.model")
        self.updown_cnt_mdl.load_model(model_path)

        self.page_from = 0

    def __char_width(self, c):
        return (c["x1"] - c["x0"]) // len(c["text"])

    def __height(self, c):
        return c["bottom"] - c["top"]

    def _x_dis(self, a, b):
        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)

    def _y_dis(
            self, a, b):
        return (
                       b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2

    def _match_proj(self, b):
        proj_patt = [
            r"第[零一二三四五六七八九十百]+章",
            r"第[零一二三四五六七八九十百]+[条节]",
            r"[零一二三四五六七八九十百]+[、是 　]",
            r"[\(（][零一二三四五六七八九十百]+[）\)]",
            r"[\(（][0-9]+[）\)]",
            r"[0-9]+(、|\.[　 ]|）|\.[^0-9./a-zA-Z_%><-]{4,})",
            r"[0-9]+\.[0-9.]+(、|\.[ 　])",
            r"[⚫•➢①② ]",
        ]
        return any([re.match(p, b["text"]) for p in proj_patt])

    def _updown_concat_features(self, up, down):
        """Build the feature vector describing the box pair (``up``, ``down``).

        The returned list is what ``_concat_downward`` wraps in an
        ``xgb.DMatrix`` and hands to ``self.updown_cnt_mdl``; the order and
        number of entries must therefore stay exactly as they are.

        Both boxes must provide ``text`` (non-empty: the first/last characters
        are indexed), ``x0``, ``x1``, ``top``, ``bottom``, ``page_number``,
        ``layout_type`` and ``in_row``; ``R`` (table row index) is optional.
        """
        w = max(self.__char_width(up), self.__char_width(down))
        h = max(self.__height(up), self.__height(down))
        y_dis = self._y_dis(up, down)
        # Only the last LEN characters of `up` and the first LEN characters of
        # `down` are tokenized.
        LEN = 6
        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
        # Tokenize the junction of both texts; a space is inserted when the
        # character pair at the junction starts with an ASCII letter or digit.
        tks_all = up["text"][-LEN:].strip() \
                  + (" " if re.match(r"[a-zA-Z0-9]+",
                                     up["text"][-1] + down["text"][0]) else "") \
                  + down["text"][:LEN].strip()
        tks_all = huqie.qie(tks_all).split(" ")
        fea = [
            # same table row, normalized vertical gap, page delta
            up.get("R", -1) == down.get("R", -1),
            y_dis / h,
            down["page_number"] - up["page_number"],
            # layout-type indicators
            up["layout_type"] == down["layout_type"],
            up["layout_type"] == "text",
            down["layout_type"] == "text",
            up["layout_type"] == "table",
            down["layout_type"] == "table",
            # punctuation at the end of `up` / the start of `down`
            True if re.search(
                r"([。？！；!?;+)）]|[a-z]\.)$",
                up["text"]) else False,
            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
            True if re.search(
                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
                down["text"]) else False,
            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
            # NOTE(review): the next two entries are identical; presumably the
            # trained model expects this exact layout -- confirm before removing.
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
            # parenthesis opened in `up` and closed in `down`
            True if re.search(r"[\(（][^\)）]+$", up["text"])
                    and re.search(r"[\)）]", down["text"]) else False,
            self._match_proj(down),
            True if re.match(r"[A-Z]", down["text"]) else False,
            True if re.match(r"[A-Z]", up["text"][-1]) else False,
            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
            # both texts end with the same two characters
            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
                                                                        ) > 1 and len(
                down["text"].strip()) > 1 else False,
            # geometry: relative position, height difference, horizontal gap
            up["x0"] > down["x1"],
            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
                                                               self.__height(down)),
            self._x_dis(up, down) / max(w, 0.000001),
            # text-length and token-count statistics
            (len(up["text"]) - len(down["text"])) /
            max(len(up["text"]), len(down["text"])),
            len(tks_all) - len(tks_up) - len(tks_down),
            len(tks_down) - len(tks_up),
            tks_down[-1] == tks_up[-1],
            # number of boxes sharing a row with either box
            max(down["in_row"], up["in_row"]),
            abs(down["in_row"] - up["in_row"]),
            # single-token side whose tag contains "n"
            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
        ]
        return fea

    @staticmethod
    def sort_X_by_page(arr, threashold):
        # sort using y1 first and then x1
        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
        for i in range(len(arr) - 1):
            for j in range(i, -1, -1):
                # restore the order using th
                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
                        and arr[j + 1]["top"] < arr[j]["top"]\
                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
                    tmp = arr[j]
                    arr[j] = arr[j + 1]
                    arr[j + 1] = tmp
        return arr

    def _has_color(self, o):
        if o.get("ncs", "") == "DeviceGray":
            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
                    o["non_stroking_color"][0] == 1:
                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
                    return False
        return True

    def _table_transformer_job(self, ZM):
        """Recognize table structure and tag the text boxes lying inside tables.

        Every layout region of type ``"table"`` is cropped (with a margin) from
        its page image and passed to ``self.tbl_det``. The recognized
        components are mapped back to un-zoomed coordinates with the cumulative
        page offset applied and stored in ``self.tb_cpns``. Finally each box in
        ``self.boxes`` whose ``layout_type`` is ``"table"`` receives row
        (``R``), header (``H``), column (``C``) and spanning-cell (``SP``)
        indices together with the matching bounds.

        ``ZM`` is the zoom factor between layout coordinates and page-image
        pixels. Returns ``None``; returns early when no table is present.
        """
        logging.info("Table processing...")
        imgs, pos = [], []
        # tbcnt[k + 1] holds the number of tables on page k; the leading 0
        # turns the cumulative sum below into per-page slice boundaries.
        tbcnt = [0]
        MARGIN = 10
        self.tb_cpns = []
        assert len(self.page_layout) == len(self.page_images)
        for p, tbls in enumerate(self.page_layout):  # for page
            tbls = [f for f in tbls if f["type"] == "table"]
            tbcnt.append(len(tbls))
            if not tbls:
                continue
            for tb in tbls:  # for table
                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
                                         tb["x1"] + MARGIN, tb["bottom"] + MARGIN
                # scale layout coordinates up to image pixels
                left *= ZM
                top *= ZM
                right *= ZM
                bott *= ZM
                # remember the crop origin to translate results back later
                pos.append((left, top))
                imgs.append(self.page_images[p].crop((left, top, right, bott)))

        assert len(self.page_images) == len(tbcnt) - 1
        if not imgs:
            return
        recos = self.tbl_det(imgs)
        tbcnt = np.cumsum(tbcnt)
        for i in range(len(tbcnt) - 1):  # for page
            pg = []
            for j, tb_items in enumerate(
                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
                poss = pos[tbcnt[i]: tbcnt[i + 1]]
                for it in tb_items:  # for table components
                    # crop-relative pixels -> page pixels
                    it["x0"] = (it["x0"] + poss[j][0])
                    it["x1"] = (it["x1"] + poss[j][0])
                    it["top"] = (it["top"] + poss[j][1])
                    it["bottom"] = (it["bottom"] + poss[j][1])
                    # page pixels -> un-zoomed layout coordinates
                    for n in ["x0", "x1", "top", "bottom"]:
                        it[n] /= ZM
                    # shift by the cumulative height of the preceding pages
                    it["top"] += self.page_cum_height[i]
                    it["bottom"] += self.page_cum_height[i]
                    it["pn"] = i
                    it["layoutno"] = j
                    pg.append(it)
            self.tb_cpns.extend(pg)

        def gather(kwd, fzy=10, ption=0.6):
            # Select the components whose label matches `kwd`, sort them
            # top-down, clean them up against the text boxes and sort again.
            eles = Recognizer.sort_Y_firstly(
                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
            return Recognizer.sort_Y_firstly(eles, 0)

        # add R,H,C,SP tag to boxes within table layout
        headers = gather(r".*header$")
        rows = gather(r".* (row|header)")
        spans = gather(r".*spanning")
        clmns = sorted([r for r in self.tb_cpns if re.match(
            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
        for b in self.boxes:
            if b.get("layout_type", "") != "table":
                continue
            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
            if ii is not None:
                b["R"] = ii
                b["R_top"] = rows[ii]["top"]
                b["R_bott"] = rows[ii]["bottom"]

            ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
            if ii is not None:
                b["H_top"] = headers[ii]["top"]
                b["H_bott"] = headers[ii]["bottom"]
                b["H_left"] = headers[ii]["x0"]
                b["H_right"] = headers[ii]["x1"]
                b["H"] = ii

            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
            if ii is not None:
                b["C"] = ii
                b["C_left"] = clmns[ii]["x0"]
                b["C_right"] = clmns[ii]["x1"]

            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
            if ii is not None:
                # NOTE(review): this overwrites the H_* bounds written by the
                # header match above -- confirm that is intended.
                b["H_top"] = spans[ii]["top"]
                b["H_bott"] = spans[ii]["bottom"]
                b["H_left"] = spans[ii]["x0"]
                b["H_right"] = spans[ii]["x1"]
                b["SP"] = ii

    def __ocr(self, pagenum, img, chars, ZM=3):
        bxs = self.ocr(np.array(img))
        if not bxs:
            self.boxes.append([])
            return
        bxs = [(line[0], line[1][0]) for line in bxs]
        bxs = Recognizer.sort_Y_firstly(
            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
              "top": b[0][1] / ZM, "text": "", "txt": t,
              "bottom": b[-1][1] / ZM,
              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
            self.mean_height[-1] / 3
        )

        # merge chars in the same rect
        for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
            ii = Recognizer.find_overlapped(c, bxs)
            if ii is None:
                self.lefted_chars.append(c)
                continue
            ch = c["bottom"] - c["top"]
            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
                self.lefted_chars.append(c)
                continue
            if c["text"] == " " and bxs[ii]["text"]:
                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
            else:
                bxs[ii]["text"] += c["text"]

        for b in bxs:
            if not b["text"]:
                b["text"] = b["txt"]
            del b["txt"]
        if self.mean_height[-1] == 0:
            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
                                              for b in bxs])
        self.boxes.append(bxs)

    def _layouts_rec(self, ZM):
        assert len(self.page_images) == len(self.boxes)
        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
        # cumlative Y
        for i in range(len(self.boxes)):
            self.boxes[i]["top"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["bottom"] += \
                self.page_cum_height[self.boxes[i]["page_number"] - 1]

    def _text_merge(self):
        # merge adjusted boxes
        bxs = self.boxes

        def end_with(b, txt):
            txt = txt.strip()
            tt = b.get("text", "").strip()
            return tt and tt.find(txt) == len(tt) - len(txt)

        def start_with(b, txts):
            tt = b.get("text", "").strip()
            return tt and any([tt.find(t.strip()) == 0 for t in txts])

        # horizontally merge adjacent box with the same layout
        i = 0
        while i < len(bxs) - 1:
            b = bxs[i]
            b_ = bxs[i + 1]
            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
                i += 1
                continue

            dis_thr = 1
            dis = b["x1"] - b_["x0"]
            if b.get("layout_type", "") != "text" or b_.get(
                    "layout_type", "") != "text":
                if end_with(b, "，") or start_with(b_, "（，"):
                    dis_thr = -8
                else:
                    i += 1
                    continue

            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
                    and dis >= dis_thr and b["x1"] < b_["x1"]:
                # merge
                bxs[i]["x1"] = b_["x1"]
                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
                bxs[i]["text"] += b_["text"]
                bxs.pop(i + 1)
                continue
            i += 1
        self.boxes = bxs

    def _naive_vertical_merge(self):
        """Merge vertically consecutive boxes using punctuation/geometry rules.

        Boxes are sorted top-down, then each box ``b`` is compared with its
        successor ``b_``. Blank boxes are dropped, as are boxes made only of
        digits/bullets/dashes that are directly followed by a box of a later
        page. The pair is kept apart when any "do not concatenate" feature
        holds and no "concatenate" feature does; otherwise ``b_`` is merged
        into ``b``.
        """
        bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
        i = 0
        while i + 1 < len(bxs):
            b = bxs[i]
            b_ = bxs[i + 1]
            if b["page_number"] < b_["page_number"] and re.match(r"[0-9  •一—-]+$", b["text"]):
                bxs.pop(i)
                continue
            up_txt = b["text"].strip()
            if not up_txt:
                bxs.pop(i)
                continue
            down_txt = b_["text"].strip()
            concatting_feats = [
                up_txt[-1] in ",;:'\"，、‘“；：-",
                len(up_txt) > 1 and up_txt[-2] in ",;:'\"，‘“、；：",
                # The continuation hint is the *lower* box starting with
                # closing punctuation (the upper box was inspected before).
                bool(down_txt) and down_txt[0] in "。；？！?”）),，、：",
            ]
            # features for not concating
            feats = [
                # compare the two boxes' layouts (b was compared with itself
                # before, which made this feature always False)
                b.get("layoutno", 0) != b_.get("layoutno", 0),
                up_txt[-1] in "。？！?",
                self.is_english and up_txt[-1] in ".!?",
                b["page_number"] == b_["page_number"] and b_["top"] - \
                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
                b["page_number"] < b_["page_number"] and abs(
                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
            ]
            if any(feats) and not any(concatting_feats):
                i += 1
                continue
            # merge up and down; `i` is not advanced so the grown box is
            # compared with its new successor next
            b["bottom"] = b_["bottom"]
            b["text"] += b_["text"]
            b["x0"] = min(b["x0"], b_["x0"])
            b["x1"] = max(b["x1"], b_["x1"])
            bxs.pop(i + 1)
        self.boxes = bxs

    def _concat_downward(self, concat_between_pages=True):
        """Chain boxes that continue each other downwards and merge each chain.

        Three phases:
        1. annotate every box with ``in_row``, the number of nearby boxes on
           (roughly) the same text line;
        2. starting from the first remaining box, follow continuation boxes
           depth-first (same layout for "text" boxes, otherwise the decision
           of ``self.updown_cnt_mdl``) to build blocks;
        3. concatenate the boxes of each block into its first box.

        ``concat_between_pages`` set to ``False`` stops a chain at the first
        candidate that lies on a later page. ``self.boxes`` is replaced by
        the merged boxes sorted top-down.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            # only a window of 12 boxes before and after is inspected
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # box j is more than one line below: stop scanning
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # Append `up` to the current chunk, then look at up to 12
                # boxes starting at index `dp` for its continuation. The
                # matched box is popped from `boxes` after the recursion
                # returns.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # too far below: give up (looser limit across pages)
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break

                    # different table rows are not joined unless `up` ends
                    # with a full-width comma
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != "，":
                        i += 1
                        continue

                    # skip texts shaped like "12/345"
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue

                    if not down["text"].strip():
                        i += 1
                        continue

                    # horizontally more than 10 mean character widths apart
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue

                    # for "text" layouts the first 5 candidates are decided by
                    # layout number alone, without consulting the model
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # NOTE(review): this tests the last character of both texts;
                # the first character of `c` may have been intended -- confirm.
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)

    def _filter_forpages(self):
        if not self.boxes:
            return
        findit = False
        i = 0
        while i < len(self.boxes):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
                i += 1
                continue
            findit = True
            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
            self.boxes.pop(i)
            if i >= len(self.boxes): break
            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            while not prefix:
                self.boxes.pop(i)
                if i >= len(self.boxes): break
                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
            self.boxes.pop(i)
            if i >= len(self.boxes) or not prefix: break
            for j in range(i, min(i + 128, len(self.boxes))):
                if not re.match(prefix, self.boxes[j]["text"]):
                    continue
                for k in range(i, j): self.boxes.pop(i)
                break
        if findit:return

        page_dirty = [0] * len(self.page_images)
        for b in self.boxes:
            if re.search(r"(··|··|··)", b["text"]):
                page_dirty[b["page_number"]-1] += 1
        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
        if not page_dirty: return
        i = 0
        while i < len(self.boxes):
            if self.boxes[i]["page_number"] in page_dirty:
                self.boxes.pop(i)
                continue
            i += 1

    def _merge_with_same_bullet(self):
        i = 0
        while i + 1 < len(self.boxes):
            b = self.boxes[i]
            b_ = self.boxes[i + 1]
            if not b["text"].strip():
                self.boxes.pop(i)
                continue
            if not b_["text"].strip():
                self.boxes.pop(i+1)
                continue

            if b["text"].strip()[0] != b_["text"].strip()[0] \
                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
                    or huqie.is_chinese(b["text"].strip()[0]) \
                    or b["top"] > b_["bottom"]:
                i += 1
                continue
            b_["text"] = b["text"] + "\n" + b_["text"]
            b_["x0"] = min(b["x0"], b_["x0"])
            b_["x1"] = max(b["x1"], b_["x1"])
            b_["top"] = b["top"]
            self.boxes.pop(i)

    def _extract_table_figure(self, need_image, ZM, return_html, need_position):
        """Pull table and figure boxes out of ``self.boxes`` and render them.

        Steps:
        1. move boxes with ``layout_type`` "table" (and "figure" when
           ``need_image`` is true) out of ``self.boxes``, grouped by
           "<page>-<layoutno>"; "source" notes inside them are discarded;
        2. merge a table with the one that continues it on the next page;
        3. attach caption boxes to the nearest table or figure group;
        4. crop each group from the page images.

        Args:
            need_image: also extract figure layouts.
            ZM: zoom factor between box coordinates and page-image pixels.
            return_html: forwarded as ``html`` to ``self.tbl_det.construct_table``.
            need_position: when true, return ``(item, positions)`` pairs.

        Returns:
            A list of ``(image, content)`` tuples, where ``content`` is a
            one-element list with the figure text or the result of
            ``construct_table`` for tables. With ``need_position`` each tuple
            is paired with its list of
            ``(page, left, right, top, bottom)`` positions.
        """
        tables = {}
        figures = {}
        source_note = r"(数据|资料|图表)*来源[:： ]"
        # extract figure and table boxes
        i = 0
        lst_lout_no = ""
        nomerge_lout_no = []
        while i < len(self.boxes):
            box = self.boxes[i]
            if "layoutno" not in box:
                i += 1
                continue
            lout_no = str(box["page_number"]) + "-" + str(box["layoutno"])
            # a caption/title/reference after a table means the table ended
            # there, so it must not be merged with a table on the next page
            if TableStructureRecognizer.is_caption(box) or box["layout_type"] in [
                    "table caption", "title", "figure caption", "reference"]:
                nomerge_lout_no.append(lst_lout_no)
            if box["layout_type"] == "table":
                if re.match(source_note, box["text"]):
                    self.boxes.pop(i)
                    continue
                tables.setdefault(lout_no, []).append(box)
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            if need_image and box["layout_type"] == "figure":
                if re.match(source_note, box["text"]):
                    self.boxes.pop(i)
                    continue
                figures.setdefault(lout_no, []).append(box)
                self.boxes.pop(i)
                lst_lout_no = lout_no
                continue
            i += 1

        # merge table on different pages
        nomerge_lout_no = set(nomerge_lout_no)
        tbls = sorted(tables.items(),
                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))

        # walk backwards so that a chain of continued tables collapses into
        # the first one
        i = len(tbls) - 1
        while i - 1 >= 0:
            k0, bxs0 = tbls[i - 1]
            k, bxs = tbls[i]
            i -= 1
            if k0 in nomerge_lout_no:
                continue
            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
                continue
            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
                continue
            mh = self.mean_height[bxs[0]["page_number"] - 1]
            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
                continue
            tables[k0].extend(tables[k])
            del tables[k]

        def x_overlapped(a, b):
            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])

        def nearest(caption, groups):
            # Return (key, squared distance) of the group closest to `caption`;
            # only the first 10 non-caption boxes of each group are measured.
            mink = ""
            minv = 1000000000
            for k, bxs in groups.items():
                for b in bxs[:10]:
                    if b.get("layout_type", "").find("caption") >= 0:
                        continue
                    y_dis = self._y_dis(caption, b)
                    x_dis = self._x_dis(
                        caption, b) if not x_overlapped(
                        caption, b) else 0
                    dis = y_dis * y_dis + x_dis * x_dis
                    if dis < minv:
                        mink = k
                        minv = dis
            return mink, minv

        # find captions and pop out
        i = 0
        while i < len(self.boxes):
            c = self.boxes[i]
            if not TableStructureRecognizer.is_caption(c):
                i += 1
                continue

            # find the nearest layouts
            tk, tv = nearest(c, tables)
            fk, fv = nearest(c, figures)
            if min(tv, fv) > 2000:
                # nothing close enough: leave the caption in the text flow
                i += 1
                continue
            if tv < fv:
                tables[tk].insert(0, c)
                logging.debug("TABLE:" + c["text"] + "; Cap: " + tk)
            else:
                figures[fk].insert(0, c)
                # report the figure key (the table key was logged here before)
                logging.debug("FIGURE:" + c["text"] + "; Cap: " + fk)
            self.boxes.pop(i)

        res = []
        positions = []

        def cropout(bxs, ltype, poss):
            # Crop the region covering `bxs` from the page image(s) and record
            # its position(s) in `poss`. Multi-page groups are cropped page by
            # page and stacked vertically on a light-grey canvas.
            pn = set([b["page_number"] - 1 for b in bxs])
            if len(pn) < 2:
                pn = list(pn)[0]
                ht = self.page_cum_height[pn]
                b = {
                    "x0": np.min([b["x0"] for b in bxs]),
                    "top": np.min([b["top"] for b in bxs]) - ht,
                    "x1": np.max([b["x1"] for b in bxs]),
                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
                }
                # prefer the bounds of the recognized layout region, if any
                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
                ii = Recognizer.find_overlapped(b, louts, naive=True)
                if ii is not None:
                    b = louts[ii]
                else:
                    logging.warning(
                        "Missing layout match: %s,%s",
                        pn + 1, bxs[0].get("layoutno", ""))

                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
                poss.append((pn + self.page_from, left, right, top, bott))
                return self.page_images[pn] \
                    .crop((left * ZM, top * ZM,
                           right * ZM, bott * ZM))
            pn = {}
            for b in bxs:
                p = b["page_number"] - 1
                if p not in pn:
                    pn[p] = []
                pn[p].append(b)
            pn = sorted(pn.items(), key=lambda x: x[0])
            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
            pic = Image.new("RGB",
                            (int(np.max([i.size[0] for i in imgs])),
                             int(np.sum([m.size[1] for m in imgs]))),
                            (245, 245, 245))
            height = 0
            for img in imgs:
                pic.paste(img, (0, int(height)))
                height += img.size[1]
            return pic

        # crop figure out and add caption
        for k, bxs in figures.items():
            txt = "\n".join(
                [b["text"] for b in bxs
                 if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
                 and len(b["text"].strip()) >= 4
                 ]
            )
            if not txt:
                continue

            poss = []
            res.append(
                (cropout(
                    bxs,
                    "figure", poss),
                 [txt]))
            positions.append(poss)

        for k, bxs in tables.items():
            if not bxs:
                continue
            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"] - b["top"]) / 2 for b in bxs]))
            poss = []
            res.append((cropout(bxs, "table", poss),
                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
            positions.append(poss)

        assert len(positions) == len(res)

        if need_position:
            return list(zip(res, positions))
        return res

    def proj_match(self, line):
        if len(line) <= 2:
            return
        if re.match(r"[0-9 ().,%%+/-]+$", line):
            return False
        for p, j in [
            (r"第[零一二三四五六七八九十百]+章", 1),
            (r"第[零一二三四五六七八九十百]+[条节]", 2),
            (r"[零一二三四五六七八九十百]+[、 　]", 3),
            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
            (r"[0-9]+(、|\.[　 ]|\.[^0-9])", 5),
            (r"[0-9]+\.[0-9]+(、|[. 　]|[^0-9])", 6),
            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 7),
            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 8),
            (r".{,48}[：:?？]$", 9),
            (r"[0-9]+）", 10),
            (r"[\(（][0-9]+[）\)]", 11),
            (r"[零一二三四五六七八九十百]+是", 12),
            (r"[⚫•➢✓]", 12)
        ]:
            if re.match(p, line):
                return j
        return

    def _line_tag(self, bx, ZM):
        pn = [bx["page_number"]]
        top = bx["top"] - self.page_cum_height[pn[0] - 1]
        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
            pn.append(pn[-1] + 1)

        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
            .format("-".join([str(p) for p in pn]),
                    bx["x0"], bx["x1"], top, bott)

    def __filterout_scraps(self, boxes, ZM):
        """Group boxes into text blocks and drop blocks that look like scraps.

        Starting from the first remaining box, following boxes on the same
        page are chained depth-first when the current line matches a
        heading/bullet pattern (or is a title) or when they are horizontally
        close to it. A chain is kept when its first box is a heading/title or
        when its mean width is at least 35% of the page width or above 200;
        otherwise it is only logged as removed.

        ``boxes`` is consumed (emptied) by this method. Returns the kept
        blocks as one string: lines carry their ``_line_tag`` and are joined
        with a newline, blocks are joined with a blank line.
        """

        def width(b):
            return b["x1"] - b["x0"]

        def height(b):
            return b["bottom"] - b["top"]

        def usefull(b):
            # A box is worth keeping when it has a layout type, spans more
            # than a third of the page width, or is taller than the page's
            # mean line height.
            if b.get("layout_type"):
                return True
            if width(
                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
                return True
            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
                return True
            return False

        res = []
        while boxes:
            lines = []
            widths = []
            # page width in box coordinates and mean line height of the page
            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
            mh = self.mean_height[boxes[0]["page_number"] - 1]
            # truthy when the block's first box is a heading/bullet or a title
            mj = self.proj_match(
                boxes[0]["text"]) or boxes[0].get(
                "layout_type",
                "") == "title"

            def dfs(line, st):
                # Add `line` to the block, then look at up to the next 19
                # boxes after index `st` for the box that continues it.
                nonlocal mh, pw, lines, widths
                lines.append(line)
                widths.append(width(line))
                width_mean = np.mean(widths)
                mmj = self.proj_match(
                    line["text"]) or line.get(
                    "layout_type",
                    "") == "title"
                for i in range(st + 1, min(st + 20, len(boxes))):
                    # never continue onto a later page
                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
                        break
                    # a plain, normal-height line is not continued by a box
                    # three or more line heights below it
                    if not mmj and self._y_dis(
                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
                        break

                    if not usefull(boxes[i]):
                        continue
                    if mmj or \
                            (self._x_dis(boxes[i], line) < pw / 10): \
                            # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
                        # concat following
                        dfs(boxes[i], i)
                        boxes.pop(i)
                        break

            # NOTE(review): every exception raised while chaining is silently
            # discarded here -- confirm this best-effort behaviour is intended.
            try:
                if usefull(boxes[0]):
                    dfs(boxes[0], 0)
                else:
                    logging.debug("WASTE: " + boxes[0]["text"])
            except Exception as e:
                pass
            boxes.pop(0)
            # `widths` is empty when the first box was not useful
            mw = np.mean(widths)
            if mj or mw / pw >= 0.35 or mw > 200:
                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
            else:
                logging.debug("REMOVED: " +
                              "<<".join([c["text"] for c in lines]))

        return "\n\n".join(res)

    @staticmethod
    def total_page_number(fnm, binary=None):
        """Return the number of pages of a PDF.

        pdfplumber is tried first; if it raises, PyMuPDF (``fitz``) is used
        instead. The document is closed again before returning.

        Args:
            fnm: path of the PDF file; only used when ``binary`` is falsy.
            binary: raw PDF content; takes precedence over ``fnm`` when given.

        Returns:
            int: the page count.
        """
        try:
            source = BytesIO(binary) if binary else fnm
            with pdfplumber.open(source) as pdf:
                return len(pdf.pages)
        except Exception:
            # The in-memory content must be passed as the stream here, not
            # the file name.
            if binary:
                doc = fitz.open(stream=binary, filetype="pdf")
            else:
                doc = fitz.open(fnm)
            with doc:
                return len(doc)

    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
        """Render the requested pages to images and run OCR on each of them.

        Resets the per-document state, loads pages ``page_from:page_to`` with
        pdfplumber (falling back to PyMuPDF, which yields no embedded
        characters), decides whether the document is English, and calls
        ``self.__ocr`` once per page.

        Args:
            fnm: a ``str`` path, or anything else which is treated as the raw
                PDF content.
            zoomin: rendering scale; images are rendered at ``72 * zoomin``
                dpi and image heights are divided by it for
                ``page_cum_height``.
            page_from: index of the first page to load (inclusive).
            page_to: index of the last page to load (exclusive).
            callback: optional callable invoked after each page with the
                keyword arguments ``prog`` (up to 0.6) and ``msg``.

        Side effects:
            Sets ``pdf``, ``page_images``, ``page_chars``, ``total_page``,
            ``is_english``, ``mean_height``, ``mean_width``, ``boxes`` (via
            ``__ocr``), ``page_cum_height`` (cumulative sum with a leading 0)
            and clears ``lefted_chars``, ``garbages``, ``page_layout``.
        """
        self.lefted_chars = []
        self.mean_height = []
        self.mean_width = []
        self.boxes = []
        self.garbages = {}
        self.page_cum_height = [0]
        self.page_layout = []
        self.page_from = page_from
        try:
            self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
            self.page_images = [p.to_image(resolution=72 * zoomin).annotated
                                for p in self.pdf.pages[page_from:page_to]]
            self.page_chars = [[c for c in page.chars if self._has_color(c)]
                               for page in self.pdf.pages[page_from:page_to]]
            self.total_page = len(self.pdf.pages)
        except Exception:
            # pdfplumber could not handle the file: render with PyMuPDF and
            # rely on OCR only (no embedded characters are collected).
            self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
            self.page_images = []
            self.page_chars = []
            mat = fitz.Matrix(zoomin, zoomin)
            self.total_page = len(self.pdf)
            for i, page in enumerate(self.pdf):
                if i < page_from:
                    continue
                if i >= page_to:
                    break
                pix = page.get_pixmap(matrix=mat)
                img = Image.frombytes("RGB", [pix.width, pix.height],
                                      pix.samples)
                self.page_images.append(img)
                self.page_chars.append([])

        logging.info("Images converted.")

        # A page votes "English" when a random sample of up to 100 of its
        # characters contains a run of 30+ ASCII letters/digits/punctuation.
        english_pages = 0
        for page_chars in self.page_chars:
            sample = "".join(random.choices([c["text"] for c in page_chars],
                                            k=min(100, len(page_chars))))
            if re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", sample):
                english_pages += 1
        self.is_english = english_pages > len(self.page_images) / 2

        for i, img in enumerate(self.page_images):
            # For English documents the embedded characters are not handed
            # to the OCR step.
            chars = self.page_chars[i] if not self.is_english else []
            self.mean_height.append(
                np.median([c["height"] for c in chars]) if chars else 0
            )
            self.mean_width.append(
                np.median([c["width"] for c in chars]) if chars else 8
            )
            self.page_cum_height.append(img.size[1] / zoomin)
            # Append a space to an alphanumeric character when the gap to the
            # next one is at least half the narrower character's width.
            for j in range(len(chars) - 1):
                cur, nxt = chars[j], chars[j + 1]
                if cur["text"] and nxt["text"] \
                        and re.match(r"[0-9a-zA-Z,.:;!%]+", cur["text"] + nxt["text"]) \
                        and nxt["x0"] - cur["x1"] >= min(nxt["width"], cur["width"]) / 2:
                    cur["text"] += " "
            self.__ocr(i + 1, img, chars, zoomin)
            if callback:
                callback(prog=(i + 1) * 0.6 / len(self.page_images), msg="")

        if not self.is_english and not any(self.page_chars) and self.boxes:
            # No embedded characters at all: repeat the language check on a
            # random sample of the OCR boxes.
            bxes = [b for bxs in self.boxes for b in bxs]
            sample = "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))])
            self.is_english = bool(
                re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", sample))

        logging.info("Is it English: %s", self.is_english)

        self.page_cum_height = np.cumsum(self.page_cum_height)
        assert len(self.page_cum_height) == len(self.page_images) + 1

    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
        """Parse a PDF by running the processing stages in their fixed order.

        Stages: ``__images__``, ``_layouts_rec``, ``_table_transformer_job``,
        ``_text_merge``, ``_concat_downward``, ``_filter_forpages``,
        ``_extract_table_figure`` and finally ``__filterout_scraps`` on a deep
        copy of ``self.boxes``.

        Returns:
            tuple: ``(text, tables)`` where ``text`` is the result of
            ``__filterout_scraps`` and ``tables`` is whatever
            ``_extract_table_figure`` returned.
        """
        self.__images__(fnm, zoomin)
        self._layouts_rec(zoomin)
        self._table_transformer_job(zoomin)
        self._text_merge()
        self._concat_downward()
        self._filter_forpages()
        tables = self._extract_table_figure(
            need_image, zoomin, return_html, False)
        # Work on a copy: __filterout_scraps consumes the list it is given.
        remaining_boxes = deepcopy(self.boxes)
        text = self.__filterout_scraps(remaining_boxes, zoomin)
        return text, tables

    def remove_tag(self, txt):
        """Return ``txt`` with every ``@@...##`` position tag stripped out.

        A tag is ``@@`` followed by one or more tabs, digits, dots or hyphens
        and closed by ``##``.
        """
        tag_pattern = re.compile(r"@@[\t0-9.-]+?##")
        return tag_pattern.sub("", txt)

    def crop(self, text, ZM=3, need_position=False):
        """Cut out the page regions referenced by position tags and stack them.

        Every ``@@pages\\tx0\\tx1\\ttop\\tbottom##`` tag found in ``text`` is
        cropped from ``self.page_images``. A strip of up to 120 units above
        the first region and below the last one is added as context; these
        two strips are darkened. All pieces are pasted top to bottom, 6
        pixels apart, onto a light-grey canvas.

        Tags that match the pattern but do not consist of exactly five
        well-formed fields are skipped.

        Args:
            text: string that may contain position tags.
            ZM: factor tag coordinates are multiplied by to obtain pixel
                coordinates in ``self.page_images``.
            need_position: when true, also return the cropped positions.

        Returns:
            The stitched image, or ``None`` when ``text`` holds no usable tag.
            With ``need_position`` a pair ``(image, positions)`` (or
            ``(None, None)``), where ``positions`` is a list of
            ``(page index + self.page_from, left, right, top, bottom)`` tuples
            for the tagged regions only (context strips excluded).
        """
        poss = []
        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
            try:
                pn, left, right, top, bottom = tag.strip(
                    "#").strip("@").split("\t")
                poss.append(([int(p) - 1 for p in pn.split("-")],
                             float(left), float(right),
                             float(top), float(bottom)))
            except ValueError:
                # Wrong number of fields or a non-numeric value: ignore the
                # tag rather than abort the whole crop.
                logging.warning("Ignoring malformed position tag: %r", tag)
        if not poss:
            if need_position:
                return None, None
            return None

        max_width = np.max([right - left for (_, left, right, _, _) in poss])
        GAP = 6
        # Context strip just above the first region (same page, same x-range).
        first = poss[0]
        poss.insert(0, ([first[0][0]], first[1], first[2],
                        max(0, first[3] - 120), max(first[3] - GAP, 0)))
        # Context strip just below the last region, clipped to its last page.
        last = poss[-1]
        last_page_height = self.page_images[last[0][-1]].size[1] / ZM
        poss.append(([last[0][-1]], last[1], last[2],
                     min(last_page_height, last[4] + GAP),
                     min(last_page_height, last[4] + 120)))

        imgs = []
        positions = []
        for ii, (pns, left, right, top, bottom) in enumerate(poss):
            is_tagged = 0 < ii < len(poss) - 1  # not one of the context strips
            right = left + max_width
            # Add the pixel heights of all pages but the last to `bottom`.
            bottom *= ZM
            for pn in pns[1:]:
                bottom += self.page_images[pn - 1].size[1]

            page_height = self.page_images[pns[0]].size[1]
            imgs.append(self.page_images[pns[0]].crop(
                (left * ZM, top * ZM, right * ZM, min(bottom, page_height))))
            if is_tagged:
                positions.append((pns[0] + self.page_from, left, right, top,
                                  min(bottom, page_height) / ZM))
            bottom -= page_height

            # Remaining pages of a multi-page region start at the page top.
            for pn in pns[1:]:
                page_height = self.page_images[pn].size[1]
                imgs.append(self.page_images[pn].crop(
                    (left * ZM, 0, right * ZM, min(bottom, page_height))))
                if is_tagged:
                    positions.append((pn + self.page_from, left, right, 0,
                                      min(bottom, page_height) / ZM))
                bottom -= page_height

        height = int(sum(img.size[1] + GAP for img in imgs))
        width = int(np.max([i.size[0] for i in imgs]))
        pic = Image.new("RGB",
                        (width, height),
                        (245, 245, 245))
        offset = 0
        for ii, img in enumerate(imgs):
            if ii == 0 or ii + 1 == len(imgs):
                # Darken the leading and trailing context strips.
                img = img.convert('RGBA')
                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
                overlay.putalpha(128)
                img = Image.alpha_composite(img, overlay).convert("RGB")
            pic.paste(img, (0, int(offset)))
            offset += img.size[1] + GAP

        if need_position:
            return pic, positions
        return pic


if __name__ == "__main__":
    # No script behaviour is implemented: running this module directly is a no-op.
    pass
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								# -*- coding: utf-8 -*-
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								import random
-												llm configuation refine and trievalTest API refine (#40)


											
										
										
											2024-01-19 19:51:57 +08:00
+								import fitz
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								import xgboost as xgb
-												use minio to store uploaded files; build dialog server; (#16)

* format code

* use minio to store uploaded files; build dialog server;
											
										
										
											2023-12-25 19:05:59 +08:00
+								from io import BytesIO
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								import torch
 								import re
 								import pdfplumber
 								import logging
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								from PIL import Image, ImageDraw
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								import numpy as np
-												remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55)


											
										
										
											2024-02-05 18:08:17 +08:00
 								from api.db import ParserType
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								from deepdoc.vision import OCR, Recognizer, LayoutRecognizer, TableStructureRecognizer
-												build python version rag-flow (#21)

* clean rust version project

* clean rust version project

* build python version rag-flow
											
										
										
											2024-01-15 08:46:22 +08:00
+								from rag.nlp import huqie
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								from copy import deepcopy
 								from huggingface_hub import hf_hub_download
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
-												remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55)


											
										
										
											2024-02-05 18:08:17 +08:00
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								logging.getLogger("pdfminer").setLevel(logging.WARNING)
 								class HuParser:
 								    def __init__(self):
-												use onnx models, new deepdoc (#68)


											
										
										
											2024-02-21 16:32:38 +08:00
+								        self.ocr = OCR()
-												resolve the issue of naive parser (#87)


											
										
										
											2024-02-29 18:53:02 +08:00
+								        if hasattr(self, "model_speciess"):
 								            self.layouter = LayoutRecognizer("layout."+self.model_speciess)
 								        else:
 								            self.layouter = LayoutRecognizer("layout")
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        self.tbl_det = TableStructureRecognizer()
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
 								        self.updown_cnt_mdl = xgb.Booster()
 								        if torch.cuda.is_available():
 								            self.updown_cnt_mdl.set_param({"device": "cuda"})
 								        self.updown_cnt_mdl.load_model(hf_hub_download(repo_id="InfiniFlow/text_concat_xgb_v1.0",
 								                                                       filename="updown_concat_xgb.model"))
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								        self.page_from = 0
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        """
 								        If you have trouble downloading HuggingFace models, -_^ this might help!!
 								        For Linux:
 								        export HF_ENDPOINT=https://hf-mirror.com
 								        For Windows:
 								        Good luck
 								        ^_-
 								        """
 								    def __char_width(self, c):
 								        return (c["x1"] - c["x0"]) // len(c["text"])
 								    def __height(self, c):
 								        return c["bottom"] - c["top"]
 								    def _x_dis(self, a, b):
 								        return min(abs(a["x1"] - b["x0"]), abs(a["x0"] - b["x1"]),
 								                   abs(a["x0"] + a["x1"] - b["x0"] - b["x1"]) / 2)
 								    def _y_dis(
 								            self, a, b):
 								        return (
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                       b["top"] + b["bottom"] - a["top"] - a["bottom"]) / 2
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
 								    def _match_proj(self, b):
 								        proj_patt = [
 								            r"第[零一二三四五六七八九十百]+章",
 								            r"第[零一二三四五六七八九十百]+[条节]",
 								            r"[零一二三四五六七八九十百]+[、是 　]",
 								            r"[\(（][零一二三四五六七八九十百]+[）\)]",
 								            r"[\(（][0-9]+[）\)]",
 								            r"[0-9]+(、|\.[　 ]|）|\.[^0-9./a-zA-Z_%><-]{4,})",
 								            r"[0-9]+\.[0-9.]+(、|\.[ 　])",
 								            r"[⚫•➢①② ]",
 								        ]
 								        return any([re.match(p, b["text"]) for p in proj_patt])
 								    def _updown_concat_features(self, up, down):
 								        w = max(self.__char_width(up), self.__char_width(down))
 								        h = max(self.__height(up), self.__height(down))
 								        y_dis = self._y_dis(up, down)
 								        LEN = 6
 								        tks_down = huqie.qie(down["text"][:LEN]).split(" ")
 								        tks_up = huqie.qie(up["text"][-LEN:]).split(" ")
 								        tks_all = up["text"][-LEN:].strip() \
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                  + (" " if re.match(r"[a-zA-Z0-9]+",
 								                                     up["text"][-1] + down["text"][0]) else "") \
 								                  + down["text"][:LEN].strip()
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        tks_all = huqie.qie(tks_all).split(" ")
 								        fea = [
 								            up.get("R", -1) == down.get("R", -1),
 								            y_dis / h,
 								            down["page_number"] - up["page_number"],
 								            up["layout_type"] == down["layout_type"],
 								            up["layout_type"] == "text",
 								            down["layout_type"] == "text",
 								            up["layout_type"] == "table",
 								            down["layout_type"] == "table",
 								            True if re.search(
 								                r"([。？！；!?;+)）]|[a-z]\.)$",
 								                up["text"]) else False,
 								            True if re.search(r"[，：‘“、0-9（+-]$", up["text"]) else False,
 								            True if re.search(
 								                r"(^.?[/,?;:\]，。；：’”？！》】）-])",
 								                down["text"]) else False,
 								            True if re.match(r"[\(（][^\(\)（）]+[）\)]$", up["text"]) else False,
 								            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
 								            True if re.search(r"[，,][^。.]+$", up["text"]) else False,
 								            True if re.search(r"[\(（][^\)）]+$", up["text"])
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                    and re.search(r"[\)）]", down["text"]) else False,
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            self._match_proj(down),
 								            True if re.match(r"[A-Z]", down["text"]) else False,
 								            True if re.match(r"[A-Z]", up["text"][-1]) else False,
 								            True if re.match(r"[a-z0-9]", up["text"][-1]) else False,
 								            True if re.match(r"[0-9.%,-]+$", down["text"]) else False,
 								            up["text"].strip()[-2:] == down["text"].strip()[-2:] if len(up["text"].strip()
 								                                                                        ) > 1 and len(
 								                down["text"].strip()) > 1 else False,
 								            up["x0"] > down["x1"],
 								            abs(self.__height(up) - self.__height(down)) / min(self.__height(up),
 								                                                               self.__height(down)),
 								            self._x_dis(up, down) / max(w, 0.000001),
 								            (len(up["text"]) - len(down["text"])) /
 								            max(len(up["text"]), len(down["text"])),
 								            len(tks_all) - len(tks_up) - len(tks_down),
 								            len(tks_down) - len(tks_up),
 								            tks_down[-1] == tks_up[-1],
 								            max(down["in_row"], up["in_row"]),
 								            abs(down["in_row"] - up["in_row"]),
 								            len(tks_down) == 1 and huqie.tag(tks_down[0]).find("n") >= 0,
 								            len(tks_up) == 1 and huqie.tag(tks_up[0]).find("n") >= 0
 								        ]
 								        return fea
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								    @staticmethod
 								    def sort_X_by_page(arr, threashold):
 								        # sort using y1 first and then x1
 								        arr = sorted(arr, key=lambda r: (r["page_number"], r["x0"], r["top"]))
 								        for i in range(len(arr) - 1):
 								            for j in range(i, -1, -1):
 								                # restore the order using th
 								                if abs(arr[j + 1]["x0"] - arr[j]["x0"]) < threashold \
 								                        and arr[j + 1]["top"] < arr[j]["top"]\
 								                        and arr[j + 1]["page_number"] == arr[j]["page_number"]:
 								                    tmp = arr[j]
 								                    arr[j] = arr[j + 1]
 								                    arr[j + 1] = tmp
 								        return arr
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								    def _has_color(self, o):
 								        if o.get("ncs", "") == "DeviceGray":
 								            if o["stroking_color"] and o["stroking_color"][0] == 1 and o["non_stroking_color"] and \
 								                    o["non_stroking_color"][0] == 1:
 								                if re.match(r"[a-zT_\[\]\(\)-]+", o.get("text", "")):
 								                    return False
 								        return True
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								    def _table_transformer_job(self, ZM):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        logging.info("Table processing...")
 								        imgs, pos = [], []
 								        tbcnt = [0]
 								        MARGIN = 10
 								        self.tb_cpns = []
 								        assert len(self.page_layout) == len(self.page_images)
 								        for p, tbls in enumerate(self.page_layout):  # for page
 								            tbls = [f for f in tbls if f["type"] == "table"]
 								            tbcnt.append(len(tbls))
 								            if not tbls:
 								                continue
 								            for tb in tbls:  # for table
 								                left, top, right, bott = tb["x0"] - MARGIN, tb["top"] - MARGIN, \
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                                         tb["x1"] + MARGIN, tb["bottom"] + MARGIN
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                left *= ZM
 								                top *= ZM
 								                right *= ZM
 								                bott *= ZM
 								                pos.append((left, top))
 								                imgs.append(self.page_images[p].crop((left, top, right, bott)))
 								        assert len(self.page_images) == len(tbcnt) - 1
 								        if not imgs:
 								            return
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        recos = self.tbl_det(imgs)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        tbcnt = np.cumsum(tbcnt)
 								        for i in range(len(tbcnt) - 1):  # for page
 								            pg = []
 								            for j, tb_items in enumerate(
 								                    recos[tbcnt[i]: tbcnt[i + 1]]):  # for table
 								                poss = pos[tbcnt[i]: tbcnt[i + 1]]
 								                for it in tb_items:  # for table components
 								                    it["x0"] = (it["x0"] + poss[j][0])
 								                    it["x1"] = (it["x1"] + poss[j][0])
 								                    it["top"] = (it["top"] + poss[j][1])
 								                    it["bottom"] = (it["bottom"] + poss[j][1])
 								                    for n in ["x0", "x1", "top", "bottom"]:
 								                        it[n] /= ZM
 								                    it["top"] += self.page_cum_height[i]
 								                    it["bottom"] += self.page_cum_height[i]
 								                    it["pn"] = i
 								                    it["layoutno"] = j
 								                    pg.append(it)
 								            self.tb_cpns.extend(pg)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								        def gather(kwd, fzy=10, ption=0.6):
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            eles = Recognizer.sort_Y_firstly(
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								                [r for r in self.tb_cpns if re.match(kwd, r["label"])], fzy)
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            eles = Recognizer.layouts_cleanup(self.boxes, eles, 5, ption)
 								            return Recognizer.sort_Y_firstly(eles, 0)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
 								        # add R,H,C,SP tag to boxes within table layout
 								        headers = gather(r".*header$")
 								        rows = gather(r".* (row|header)")
 								        spans = gather(r".*spanning")
 								        clmns = sorted([r for r in self.tb_cpns if re.match(
 								            r"table column$", r["label"])], key=lambda x: (x["pn"], x["layoutno"], x["x0"]))
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        clmns = Recognizer.layouts_cleanup(self.boxes, clmns, 5, 0.5)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								        for b in self.boxes:
 								            if b.get("layout_type", "") != "table":
 								                continue
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            ii = Recognizer.find_overlapped_with_threashold(b, rows, thr=0.3)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            if ii is not None:
 								                b["R"] = ii
 								                b["R_top"] = rows[ii]["top"]
 								                b["R_bott"] = rows[ii]["bottom"]
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            ii = Recognizer.find_overlapped_with_threashold(b, headers, thr=0.3)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            if ii is not None:
 								                b["H_top"] = headers[ii]["top"]
 								                b["H_bott"] = headers[ii]["bottom"]
 								                b["H_left"] = headers[ii]["x0"]
 								                b["H_right"] = headers[ii]["x1"]
 								                b["H"] = ii
-												refine admin initialization (#75)


											
										
										
											2024-02-27 14:57:34 +08:00
+								            ii = Recognizer.find_horizontally_tightest_fit(b, clmns)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            if ii is not None:
 								                b["C"] = ii
 								                b["C_left"] = clmns[ii]["x0"]
 								                b["C_right"] = clmns[ii]["x1"]
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            ii = Recognizer.find_overlapped_with_threashold(b, spans, thr=0.3)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            if ii is not None:
 								                b["H_top"] = spans[ii]["top"]
 								                b["H_bott"] = spans[ii]["bottom"]
 								                b["H_left"] = spans[ii]["x0"]
 								                b["H_right"] = spans[ii]["x1"]
 								                b["SP"] = ii
-												use onnx models, new deepdoc (#68)


											
										
										
											2024-02-21 16:32:38 +08:00
+								    def __ocr(self, pagenum, img, chars, ZM=3):
 								        bxs = self.ocr(np.array(img))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        if not bxs:
 								            self.boxes.append([])
 								            return
 								        bxs = [(line[0], line[1][0]) for line in bxs]
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        bxs = Recognizer.sort_Y_firstly(
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            [{"x0": b[0][0] / ZM, "x1": b[1][0] / ZM,
 								              "top": b[0][1] / ZM, "text": "", "txt": t,
 								              "bottom": b[-1][1] / ZM,
 								              "page_number": pagenum} for b, t in bxs if b[0][0] <= b[1][0] and b[0][1] <= b[-1][1]],
 								            self.mean_height[-1] / 3
 								        )
 								        # merge chars in the same rect
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        for c in Recognizer.sort_X_firstly(chars, self.mean_width[pagenum - 1] // 4):
 								            ii = Recognizer.find_overlapped(c, bxs)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            if ii is None:
 								                self.lefted_chars.append(c)
 								                continue
 								            ch = c["bottom"] - c["top"]
 								            bh = bxs[ii]["bottom"] - bxs[ii]["top"]
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								            if abs(ch - bh) / max(ch, bh) >= 0.7 and c["text"] != ' ':
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                self.lefted_chars.append(c)
 								                continue
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								            if c["text"] == " " and bxs[ii]["text"]:
 								                if re.match(r"[0-9a-zA-Z,.?;:!%%]", bxs[ii]["text"][-1]): bxs[ii]["text"] += " "
 								            else:
 								                bxs[ii]["text"] += c["text"]
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
 								        for b in bxs:
 								            if not b["text"]:
 								                b["text"] = b["txt"]
 								            del b["txt"]
 								        if self.mean_height[-1] == 0:
 								            self.mean_height[-1] = np.median([b["bottom"] - b["top"]
 								                                              for b in bxs])
 								        self.boxes.append(bxs)
-												use onnx models, new deepdoc (#68)


											
										
										
											2024-02-21 16:32:38 +08:00
+								    def _layouts_rec(self, ZM):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        assert len(self.page_images) == len(self.boxes)
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        self.boxes, self.page_layout = self.layouter(self.page_images, self.boxes, ZM)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								        # cumlative Y
 								        for i in range(len(self.boxes)):
 								            self.boxes[i]["top"] += \
 								                self.page_cum_height[self.boxes[i]["page_number"] - 1]
 								            self.boxes[i]["bottom"] += \
 								                self.page_cum_height[self.boxes[i]["page_number"] - 1]
 								    def _text_merge(self):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        # merge adjusted boxes
 								        bxs = self.boxes
 								        def end_with(b, txt):
 								            txt = txt.strip()
 								            tt = b.get("text", "").strip()
 								            return tt and tt.find(txt) == len(tt) - len(txt)
 								        def start_with(b, txts):
 								            tt = b.get("text", "").strip()
 								            return tt and any([tt.find(t.strip()) == 0 for t in txts])
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								        # horizontally merge adjacent box with the same layout
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        i = 0
 								        while i < len(bxs) - 1:
 								            b = bxs[i]
 								            b_ = bxs[i + 1]
 								            if b.get("layoutno", "0") != b_.get("layoutno", "1"):
 								                i += 1
 								                continue
 								            dis_thr = 1
 								            dis = b["x1"] - b_["x0"]
 								            if b.get("layout_type", "") != "text" or b_.get(
 								                    "layout_type", "") != "text":
 								                if end_with(b, "，") or start_with(b_, "（，"):
 								                    dis_thr = -8
 								                else:
 								                    i += 1
 								                    continue
 								            if abs(self._y_dis(b, b_)) < self.mean_height[bxs[i]["page_number"] - 1] / 5 \
 								                    and dis >= dis_thr and b["x1"] < b_["x1"]:
 								                # merge
 								                bxs[i]["x1"] = b_["x1"]
 								                bxs[i]["top"] = (b["top"] + b_["top"]) / 2
 								                bxs[i]["bottom"] = (b["bottom"] + b_["bottom"]) / 2
 								                bxs[i]["text"] += b_["text"]
 								                bxs.pop(i + 1)
 								                continue
 								            i += 1
 								        self.boxes = bxs
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								    def _naive_vertical_merge(self):
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								        bxs = Recognizer.sort_Y_firstly(self.boxes, np.median(self.mean_height) / 3)
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        i = 0
 								        while i + 1 < len(bxs):
 								            b = bxs[i]
 								            b_ = bxs[i + 1]
 								            if b["page_number"] < b_["page_number"] and re.match(r"[0-9  •一—-]+$", b["text"]):
 								                bxs.pop(i)
 								                continue
-												change callback strategy, add timezone to docker (#96)


											
										
										
											2024-03-05 12:08:41 +08:00
+								            if not b["text"].strip():
 								                bxs.pop(i)
 								                continue
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            concatting_feats = [
 								                b["text"].strip()[-1] in ",;:'\"，、‘“；：-",
 								                len(b["text"].strip()) > 1 and b["text"].strip()[-2] in ",;:'\"，‘“、；：",
 								                b["text"].strip()[0] in "。；？！?”）),，、：",
 								            ]
 								            # features for not concating
 								            feats = [
 								                b.get("layoutno", 0) != b.get("layoutno", 0),
 								                b["text"].strip()[-1] in "。？！?",
 								                self.is_english and b["text"].strip()[-1] in ".!?",
 								                b["page_number"] == b_["page_number"] and b_["top"] - \
 								                b["bottom"] > self.mean_height[b["page_number"] - 1] * 1.5,
 								                b["page_number"] < b_["page_number"] and abs(
 								                    b["x0"] - b_["x0"]) > self.mean_width[b["page_number"] - 1] * 4
 								            ]
 								            if any(feats) and not any(concatting_feats):
 								                i += 1
 								                continue
 								            # merge up and down
 								            b["bottom"] = b_["bottom"]
 								            b["text"] += b_["text"]
 								            b["x0"] = min(b["x0"], b_["x0"])
 								            b["x1"] = max(b["x1"], b_["x1"])
 								            bxs.pop(i + 1)
 								        self.boxes = bxs
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
    def _concat_downward(self, concat_between_pages=True):
        """Chain boxes that continue each other vertically and merge every chain.

        Three phases:

        1. Each box in ``self.boxes`` gets an ``in_row`` count: the number of
           boxes within 12 positions of it whose vertical centre distance is
           below one mean line height of the box's page.
        2. Working on a deep copy, a depth-first search starts at the first
           remaining box and looks at up to 12 following boxes for one to chain
           to; boxes consumed by a chain are removed from the working list.
        3. Every chain is collapsed into its first box (texts joined, extents
           widened) and the result, sorted by ``Recognizer.sort_Y_firstly``,
           replaces ``self.boxes``.

        :param concat_between_pages: when false, a chain never continues onto a
            box with a larger ``page_number``.
        """
        # count boxes in the same row as a feature
        for i in range(len(self.boxes)):
            mh = self.mean_height[self.boxes[i]["page_number"] - 1]
            self.boxes[i]["in_row"] = 0
            j = max(0, i - 12)
            while j < min(i + 12, len(self.boxes)):
                if j == i:
                    j += 1
                    continue
                # centre distance expressed in mean line heights
                ydis = self._y_dis(self.boxes[i], self.boxes[j]) / mh
                if abs(ydis) < 1:
                    self.boxes[i]["in_row"] += 1
                elif ydis > 0:
                    # box j is at least one line below: stop scanning
                    break
                j += 1

        # concat between rows
        boxes = deepcopy(self.boxes)
        blocks = []
        while boxes:
            chunks = []

            def dfs(up, dp):
                # Append ``up`` to the current chain, then search boxes[dp:dp+12]
                # for the first acceptable continuation and recurse into it.
                chunks.append(up)
                i = dp
                while i < min(dp + 12, len(boxes)):
                    ydis = self._y_dis(up, boxes[i])
                    smpg = up["page_number"] == boxes[i]["page_number"]
                    mh = self.mean_height[up["page_number"] - 1]
                    mw = self.mean_width[up["page_number"] - 1]
                    # too far below: give up (looser limit across a page break)
                    if smpg and ydis > mh * 4:
                        break
                    if not smpg and ydis > mh * 16:
                        break
                    down = boxes[i]
                    if not concat_between_pages and down["page_number"] > up["page_number"]:
                        break

                    # different "R" values only chain after a trailing full-width comma
                    if up.get("R", "") != down.get(
                            "R", "") and up["text"][-1] != "，":
                        i += 1
                        continue

                    # skip texts shaped like "12/345" (NOTE(review): presumably
                    # page counters - confirm)
                    if re.match(r"[0-9]{2,3}/[0-9]{3}$", up["text"]) \
                            or re.match(r"[0-9]{2,3}/[0-9]{3}$", down["text"]):
                        i += 1
                        continue

                    if not down["text"].strip():
                        i += 1
                        continue

                    # horizontally more than 10 mean widths apart
                    if up["x1"] < down["x0"] - 10 * \
                            mw or up["x0"] > down["x1"] + 10 * mw:
                        i += 1
                        continue

                    # For the first 5 candidates after a "text" box the decision is
                    # purely "same layoutno or not"; the model is not consulted.
                    if i - dp < 5 and up.get("layout_type") == "text":
                        if up.get("layoutno", "1") == down.get(
                                "layoutno", "2"):
                            dfs(down, i + 1)
                            # pop after recursing so deeper indices stay valid
                            boxes.pop(i)
                            return
                        i += 1
                        continue

                    # otherwise chain only when the xgboost score exceeds 0.5
                    fea = self._updown_concat_features(up, down)
                    if self.updown_cnt_mdl.predict(
                            xgb.DMatrix([fea]))[0] <= 0.5:
                        i += 1
                        continue
                    dfs(down, i + 1)
                    boxes.pop(i)
                    return

            dfs(boxes[0], 1)
            boxes.pop(0)
            if chunks:
                blocks.append(chunks)

        # concat within each block
        boxes = []
        for b in blocks:
            if len(b) == 1:
                boxes.append(b[0])
                continue
            t = b[0]
            for c in b[1:]:
                t["text"] = t["text"].strip()
                c["text"] = c["text"].strip()
                if not c["text"]:
                    continue
                # insert a space when the last characters of both pieces are
                # ASCII letters, digits or dots
                if t["text"] and re.match(
                        r"[0-9\.a-zA-Z]+$", t["text"][-1] + c["text"][-1]):
                    t["text"] += " "
                t["text"] += c["text"]
                t["x0"] = min(t["x0"], c["x0"])
                t["x1"] = max(t["x1"], c["x1"])
                t["page_number"] = min(t["page_number"], c["page_number"])
                t["bottom"] = c["bottom"]
                # inherit a layout type if the head of the chain has none
                if not t["layout_type"] \
                        and c["layout_type"]:
                    t["layout_type"] = c["layout_type"]
            boxes.append(t)

        self.boxes = Recognizer.sort_Y_firstly(boxes, 0)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								    def _filter_forpages(self):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        if not self.boxes:
 								            return
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        findit = False
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								        i = 0
 								        while i < len(self.boxes):
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", re.sub(r"( | |\u3000)+", "", self.boxes[i]["text"].lower())):
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                i += 1
 								                continue
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            findit = True
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								            eng = re.match(r"[0-9a-zA-Z :'.-]{5,}", self.boxes[i]["text"].strip())
 								            self.boxes.pop(i)
 								            if i >= len(self.boxes): break
 								            prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
 								            while not prefix:
 								                self.boxes.pop(i)
 								                if i >= len(self.boxes): break
 								                prefix = self.boxes[i]["text"].strip()[:3] if not eng else " ".join(self.boxes[i]["text"].strip().split(" ")[:2])
 								            self.boxes.pop(i)
 								            if i >= len(self.boxes) or not prefix: break
 								            for j in range(i, min(i + 128, len(self.boxes))):
 								                if not re.match(prefix, self.boxes[j]["text"]):
 								                    continue
 								                for k in range(i, j): self.boxes.pop(i)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                break
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        if findit:return
 								        page_dirty = [0] * len(self.page_images)
 								        for b in self.boxes:
 								            if re.search(r"(··|··|··)", b["text"]):
 								                page_dirty[b["page_number"]-1] += 1
 								        page_dirty = set([i+1 for i, t in enumerate(page_dirty) if t > 3])
 								        if not page_dirty: return
 								        i = 0
 								        while i < len(self.boxes):
 								            if self.boxes[i]["page_number"] in page_dirty:
 								                self.boxes.pop(i)
 								                continue
 								            i += 1
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								    def _merge_with_same_bullet(self):
 								        i = 0
 								        while i + 1 < len(self.boxes):
 								            b = self.boxes[i]
 								            b_ = self.boxes[i + 1]
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            if not b["text"].strip():
 								                self.boxes.pop(i)
 								                continue
 								            if not b_["text"].strip():
 								                self.boxes.pop(i+1)
 								                continue
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								            if b["text"].strip()[0] != b_["text"].strip()[0] \
 								                    or b["text"].strip()[0].lower() in set("qwertyuopasdfghjklzxcvbnm") \
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								                    or huqie.is_chinese(b["text"].strip()[0]) \
-												Add Q&A and Book, fix task running bugs (#50)


											
										
										
											2024-02-01 18:53:56 +08:00
+								                    or b["top"] > b_["bottom"]:
 								                i += 1
 								                continue
 								            b_["text"] = b["text"] + "\n" + b_["text"]
 								            b_["x0"] = min(b["x0"], b_["x0"])
 								            b_["x1"] = max(b["x1"], b_["x1"])
 								            b_["top"] = b["top"]
 								            self.boxes.pop(i)
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								    def _extract_table_figure(self, need_image, ZM, return_html, need_position):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        tables = {}
 								        figures = {}
 								        # extract figure and table boxes
 								        i = 0
 								        lst_lout_no = ""
 								        nomerge_lout_no = []
 								        while i < len(self.boxes):
 								            if "layoutno" not in self.boxes[i]:
 								                i += 1
 								                continue
 								            lout_no = str(self.boxes[i]["page_number"]) + \
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                      "-" + str(self.boxes[i]["layoutno"])
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            if TableStructureRecognizer.is_caption(self.boxes[i]) or self.boxes[i]["layout_type"] in ["table caption", "title",
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                                                                                  "figure caption", "reference"]:
 								                nomerge_lout_no.append(lst_lout_no)
 								            if self.boxes[i]["layout_type"] == "table":
 								                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
 								                    self.boxes.pop(i)
 								                    continue
 								                if lout_no not in tables:
 								                    tables[lout_no] = []
 								                tables[lout_no].append(self.boxes[i])
 								                self.boxes.pop(i)
 								                lst_lout_no = lout_no
 								                continue
 								            if need_image and self.boxes[i]["layout_type"] == "figure":
 								                if re.match(r"(数据|资料|图表)*来源[:： ]", self.boxes[i]["text"]):
 								                    self.boxes.pop(i)
 								                    continue
 								                if lout_no not in figures:
 								                    figures[lout_no] = []
 								                figures[lout_no].append(self.boxes[i])
 								                self.boxes.pop(i)
 								                lst_lout_no = lout_no
 								                continue
 								            i += 1
 								        # merge table on different pages
 								        nomerge_lout_no = set(nomerge_lout_no)
 								        tbls = sorted([(k, bxs) for k, bxs in tables.items()],
 								                      key=lambda x: (x[1][0]["top"], x[1][0]["x0"]))
 								        i = len(tbls) - 1
 								        while i - 1 >= 0:
 								            k0, bxs0 = tbls[i - 1]
 								            k, bxs = tbls[i]
 								            i -= 1
 								            if k0 in nomerge_lout_no:
 								                continue
 								            if bxs[0]["page_number"] == bxs0[0]["page_number"]:
 								                continue
 								            if bxs[0]["page_number"] - bxs0[0]["page_number"] > 1:
 								                continue
 								            mh = self.mean_height[bxs[0]["page_number"] - 1]
 								            if self._y_dis(bxs0[-1], bxs[0]) > mh * 23:
 								                continue
 								            tables[k0].extend(tables[k])
 								            del tables[k]
 								        def x_overlapped(a, b):
 								            return not any([a["x1"] < b["x0"], a["x0"] > b["x1"]])
 								        # find captions and pop out
 								        i = 0
 								        while i < len(self.boxes):
 								            c = self.boxes[i]
 								            # mh = self.mean_height[c["page_number"]-1]
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								            if not TableStructureRecognizer.is_caption(c):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                i += 1
 								                continue
 								            # find the nearest layouts
 								            def nearest(tbls):
 								                nonlocal c
 								                mink = ""
 								                minv = 1000000000
 								                for k, bxs in tbls.items():
 								                    for b in bxs[:10]:
 								                        if b.get("layout_type", "").find("caption") >= 0:
 								                            continue
 								                        y_dis = self._y_dis(c, b)
 								                        x_dis = self._x_dis(
 								                            c, b) if not x_overlapped(
 								                            c, b) else 0
 								                        dis = y_dis * y_dis + x_dis * x_dis
 								                        if dis < minv:
 								                            mink = k
 								                            minv = dis
 								                return mink, minv
 								            tk, tv = nearest(tables)
 								            fk, fv = nearest(figures)
 								            if min(tv, fv) > 2000:
 								                i += 1
 								                continue
 								            if tv < fv:
 								                tables[tk].insert(0, c)
 								                logging.debug(
 								                    "TABLE:" +
 								                    self.boxes[i]["text"] +
 								                    "; Cap: " +
 								                    tk)
 								            else:
 								                figures[fk].insert(0, c)
 								                logging.debug(
 								                    "FIGURE:" +
 								                    self.boxes[i]["text"] +
 								                    "; Cap: " +
 								                    tk)
 								            self.boxes.pop(i)
 								        res = []
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								        positions = []
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								        def cropout(bxs, ltype, poss):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            nonlocal ZM
 								            pn = set([b["page_number"] - 1 for b in bxs])
 								            if len(pn) < 2:
 								                pn = list(pn)[0]
 								                ht = self.page_cum_height[pn]
 								                b = {
 								                    "x0": np.min([b["x0"] for b in bxs]),
 								                    "top": np.min([b["top"] for b in bxs]) - ht,
 								                    "x1": np.max([b["x1"] for b in bxs]),
 								                    "bottom": np.max([b["bottom"] for b in bxs]) - ht
 								                }
 								                louts = [l for l in self.page_layout[pn] if l["type"] == ltype]
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								                ii = Recognizer.find_overlapped(b, louts, naive=True)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                if ii is not None:
 								                    b = louts[ii]
 								                else:
 								                    logging.warn(
 								                        f"Missing layout match: {pn + 1},%s" %
 								                        (bxs[0].get(
 								                            "layoutno", "")))
 								                left, top, right, bott = b["x0"], b["top"], b["x1"], b["bottom"]
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								                poss.append((pn+self.page_from, left, right, top, bott))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                return self.page_images[pn] \
 								                    .crop((left * ZM, top * ZM,
 								                           right * ZM, bott * ZM))
 								            pn = {}
 								            for b in bxs:
 								                p = b["page_number"] - 1
 								                if p not in pn:
 								                    pn[p] = []
 								                pn[p].append(b)
 								            pn = sorted(pn.items(), key=lambda x: x[0])
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            imgs = [cropout(arr, ltype, poss) for p, arr in pn]
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            pic = Image.new("RGB",
 								                            (int(np.max([i.size[0] for i in imgs])),
 								                             int(np.sum([m.size[1] for m in imgs]))),
 								                            (245, 245, 245))
 								            height = 0
 								            for img in imgs:
 								                pic.paste(img, (0, int(height)))
 								                height += img.size[1]
 								            return pic
 								        # crop figure out and add caption
 								        for k, bxs in figures.items():
 								            txt = "\n".join(
 								                [b["text"] for b in bxs
 								                 if not re.match(r"[0-9a-z.\+%-]", b["text"].strip())
 								                 and len(b["text"].strip()) >= 4
 								                 ]
 								            )
 								            if not txt:
 								                continue
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            poss = []
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            res.append(
 								                (cropout(
 								                    bxs,
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								                    "figure", poss),
-												fix task cancling bug (#98)


											
										
										
											2024-03-05 16:33:47 +08:00
+								                 [txt]))
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            positions.append(poss)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
 								        for k, bxs in tables.items():
 								            if not bxs:
 								                continue
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            bxs = Recognizer.sort_Y_firstly(bxs, np.mean([(b["bottom"]-b["top"])/2 for b in bxs]))
 								            poss = []
 								            res.append((cropout(bxs, "table", poss),
-												rename vision, add layour and tsr recognizer (#70)

* rename vision, add layour and tsr recognizer

* trivial fixing
											
										
										
											2024-02-22 19:11:37 +08:00
+								                        self.tbl_det.construct_table(bxs, html=return_html, is_english=self.is_english)))
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            positions.append(poss)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								        assert len(positions) == len(res)
 								        if need_position: return list(zip(res, positions))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        return res
 								    def proj_match(self, line):
 								        if len(line) <= 2:
 								            return
 								        if re.match(r"[0-9 ().,%%+/-]+$", line):
 								            return False
 								        for p, j in [
 								            (r"第[零一二三四五六七八九十百]+章", 1),
 								            (r"第[零一二三四五六七八九十百]+[条节]", 2),
 								            (r"[零一二三四五六七八九十百]+[、 　]", 3),
 								            (r"[\(（][零一二三四五六七八九十百]+[）\)]", 4),
 								            (r"[0-9]+(、|\.[　 ]|\.[^0-9])", 5),
 								            (r"[0-9]+\.[0-9]+(、|[. 　]|[^0-9])", 6),
 								            (r"[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 7),
 								            (r"[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(、|[ 　]|[^0-9])", 8),
 								            (r".{,48}[：:?？]$", 9),
 								            (r"[0-9]+）", 10),
 								            (r"[\(（][0-9]+[）\)]", 11),
 								            (r"[零一二三四五六七八九十百]+是", 12),
 								            (r"[⚫•➢✓]", 12)
 								        ]:
 								            if re.match(p, line):
 								                return j
 								        return
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								    def _line_tag(self, bx, ZM):
 								        pn = [bx["page_number"]]
 								        top = bx["top"] - self.page_cum_height[pn[0] - 1]
 								        bott = bx["bottom"] - self.page_cum_height[pn[0] - 1]
 								        while bott * ZM > self.page_images[pn[-1] - 1].size[1]:
 								            bott -= self.page_images[pn[-1] - 1].size[1] / ZM
 								            pn.append(pn[-1] + 1)
 								        return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
 								            .format("-".join([str(p) for p in pn]),
 								                    bx["x0"], bx["x1"], top, bott)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								    def __filterout_scraps(self, boxes, ZM):
 								        def width(b):
 								            return b["x1"] - b["x0"]
 								        def height(b):
 								            return b["bottom"] - b["top"]
 								        def usefull(b):
 								            if b.get("layout_type"):
 								                return True
 								            if width(
 								                    b) > self.page_images[b["page_number"] - 1].size[0] / ZM / 3:
 								                return True
 								            if b["bottom"] - b["top"] > self.mean_height[b["page_number"] - 1]:
 								                return True
 								            return False
 								        res = []
 								        while boxes:
 								            lines = []
 								            widths = []
 								            pw = self.page_images[boxes[0]["page_number"] - 1].size[0] / ZM
 								            mh = self.mean_height[boxes[0]["page_number"] - 1]
 								            mj = self.proj_match(
 								                boxes[0]["text"]) or boxes[0].get(
 								                "layout_type",
 								                "") == "title"
 								            def dfs(line, st):
 								                nonlocal mh, pw, lines, widths
 								                lines.append(line)
 								                widths.append(width(line))
 								                width_mean = np.mean(widths)
 								                mmj = self.proj_match(
 								                    line["text"]) or line.get(
 								                    "layout_type",
 								                    "") == "title"
 								                for i in range(st + 1, min(st + 20, len(boxes))):
 								                    if (boxes[i]["page_number"] - line["page_number"]) > 0:
 								                        break
 								                    if not mmj and self._y_dis(
 								                            line, boxes[i]) >= 3 * mh and height(line) < 1.5 * mh:
 								                        break
 								                    if not usefull(boxes[i]):
 								                        continue
 								                    if mmj or \
 								                            (self._x_dis(boxes[i], line) < pw / 10): \
 								                            # and abs(width(boxes[i])-width_mean)/max(width(boxes[i]),width_mean)<0.5):
 								                        # concat following
 								                        dfs(boxes[i], i)
 								                        boxes.pop(i)
 								                        break
 								            try:
 								                if usefull(boxes[0]):
 								                    dfs(boxes[0], 0)
 								                else:
 								                    logging.debug("WASTE: " + boxes[0]["text"])
 								            except Exception as e:
 								                pass
 								            boxes.pop(0)
 								            mw = np.mean(widths)
 								            if mj or mw / pw >= 0.35 or mw > 200:
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								                res.append("\n".join([c["text"] + self._line_tag(c, ZM) for c in lines]))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            else:
 								                logging.debug("REMOVED: " +
 								                              "<<".join([c["text"] for c in lines]))
 								        return "\n\n".join(res)
-												Add task moduel, and pipline the task and every parser (#49)


											
										
										
											2024-01-31 19:57:45 +08:00
+								    @staticmethod
 								    def total_page_number(fnm, binary=None):
 								        try:
 								            pdf = pdfplumber.open(fnm) if not binary else pdfplumber.open(BytesIO(binary))
 								            return len(pdf.pages)
 								        except Exception as e:
 								            pdf = fitz.open(fnm) if not binary else fitz.open(stream=fnm, filetype="pdf")
 								            return len(pdf)
-												change callback strategy, add timezone to docker (#96)


											
										
										
											2024-03-05 12:08:41 +08:00
+								    def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        self.lefted_chars = []
 								        self.mean_height = []
 								        self.mean_width = []
 								        self.boxes = []
 								        self.garbages = {}
 								        self.page_cum_height = [0]
 								        self.page_layout = []
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								        self.page_from = page_from
-												llm configuation refine and trievalTest API refine (#40)


											
										
										
											2024-01-19 19:51:57 +08:00
+								        try:
 								            self.pdf = pdfplumber.open(fnm) if isinstance(fnm, str) else pdfplumber.open(BytesIO(fnm))
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            self.page_images = [p.to_image(resolution=72 * zoomin).annotated for i, p in
 								                                enumerate(self.pdf.pages[page_from:page_to])]
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            self.page_chars = [[c for c in page.chars if self._has_color(c)] for page in self.pdf.pages[page_from:page_to]]
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            self.total_page = len(self.pdf.pages)
-												llm configuation refine and trievalTest API refine (#40)


											
										
										
											2024-01-19 19:51:57 +08:00
+								        except Exception as e:
 								            self.pdf = fitz.open(fnm) if isinstance(fnm, str) else fitz.open(stream=fnm, filetype="pdf")
 								            self.page_images = []
 								            self.page_chars = []
 								            mat = fitz.Matrix(zoomin, zoomin)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            self.total_page = len(self.pdf)
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								            for i, page in enumerate(self.pdf):
 								                if i < page_from:continue
 								                if i >= page_to:break
 								                pix = page.get_pixmap(matrix=mat)
-												llm configuation refine and trievalTest API refine (#40)


											
										
										
											2024-01-19 19:51:57 +08:00
+								                img = Image.frombytes("RGB", [pix.width, pix.height],
 								                                      pix.samples)
 								                self.page_images.append(img)
 								                self.page_chars.append([])
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        logging.info("Images converted.")
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        self.is_english = [re.search(r"[a-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join(random.choices([c["text"] for c in self.page_chars[i]], k=min(100, len(self.page_chars[i]))))) for i in range(len(self.page_chars))]
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								        if sum([1 if e else 0 for e in self.is_english]) > len(self.page_images) / 2:
 								            self.is_english = True
 								        else:
 								            self.is_english = False
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        for i, img in enumerate(self.page_images):
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								            chars = self.page_chars[i] if not self.is_english else []
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            self.mean_height.append(
 								                np.median(sorted([c["height"] for c in chars])) if chars else 0
 								            )
 								            self.mean_width.append(
 								                np.median(sorted([c["width"] for c in chars])) if chars else 8
 								            )
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            self.page_cum_height.append(img.size[1] / zoomin)
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								            j = 0
 								            while j + 1 < len(chars):
 								                if chars[j]["text"] and chars[j + 1]["text"] \
 								                        and re.match(r"[0-9a-zA-Z,.:;!%]+", chars[j]["text"] + chars[j + 1]["text"]) \
 								                        and chars[j + 1]["x0"] - chars[j]["x1"] >= min(chars[j + 1]["width"],
 								                                                                       chars[j]["width"]) / 2:
 								                    chars[j]["text"] += " "
 								                j += 1
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								            # if i > 0:
 								            #     if not chars:
 								            #         self.page_cum_height.append(img.size[1] / zoomin)
 								            #     else:
 								            #         self.page_cum_height.append(
 								            #             np.max([c["bottom"] for c in chars]))
-												use onnx models, new deepdoc (#68)


											
										
										
											2024-02-21 16:32:38 +08:00
+								            self.__ocr(i + 1, img, chars, zoomin)
-												change callback strategy, add timezone to docker (#96)


											
										
										
											2024-03-05 12:08:41 +08:00
+								            if callback: callback(prog=(i+1)*0.6/len(self.page_images), msg="")
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												Some document API refined. (#53)

Add naive chunking method to RAG
											
										
										
											2024-02-02 19:21:37 +08:00
+								        if not self.is_english and not any([c for c in self.page_chars]) and self.boxes:
-												remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55)


											
										
										
											2024-02-05 18:08:17 +08:00
+								            bxes = [b for bxs in self.boxes for b in bxs]
 								            self.is_english = re.search(r"[\na-zA-Z0-9,/¸;:'\[\]\(\)!@#$%^&*\"?<>._-]{30,}", "".join([b["text"] for b in random.choices(bxes, k=min(30, len(bxes)))]))
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
 								        logging.info("Is it English:", self.is_english)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        self.page_cum_height = np.cumsum(self.page_cum_height)
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								        assert len(self.page_cum_height) == len(self.page_images) + 1
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								    def __call__(self, fnm, need_image=True, zoomin=3, return_html=False):
 								        self.__images__(fnm, zoomin)
-												use onnx models, new deepdoc (#68)


											
										
										
											2024-02-21 16:32:38 +08:00
+								        self._layouts_rec(zoomin)
-												Add app to rag module: presentaion & laws (#43)


											
										
										
											2024-01-25 18:57:39 +08:00
+								        self._table_transformer_job(zoomin)
 								        self._text_merge()
 								        self._concat_downward()
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								        self._filter_forpages()
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								        tbls = self._extract_table_figure(need_image, zoomin, return_html, False)
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        return self.__filterout_scraps(deepcopy(self.boxes), zoomin), tbls
 								    def remove_tag(self, txt):
 								        return re.sub(r"@@[\t0-9.-]+?##", "", txt)
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								    def crop(self, text, ZM=3, need_position=False):
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        imgs = []
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								        poss = []
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        for tag in re.findall(r"@@[0-9-]+\t[0-9.\t]+##", text):
 								            pn, left, right, top, bottom = tag.strip(
 								                "#").strip("@").split("\t")
 								            left, right, top, bottom = float(left), float(
 								                right), float(top), float(bottom)
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								            poss.append(([int(p) - 1 for p in pn.split("-")], left, right, top, bottom))
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								        if not poss:
 								            if need_position: return None, None
 								            return
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
 								        max_width = np.max([right-left for (_, left, right, _, _) in poss])
 								        GAP = 6
 								        pos = poss[0]
 								        poss.insert(0, ([pos[0][0]], pos[1], pos[2], max(0, pos[3]-120), max(pos[3]-GAP, 0)))
 								        pos = poss[-1]
 								        poss.append(([pos[0][-1]], pos[1], pos[2], min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+GAP), min(self.page_images[pos[0][-1]].size[1]/ZM, pos[4]+120)))
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								        positions = []
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								        for ii, (pns, left, right, top, bottom) in enumerate(poss):
 								            right = left + max_width
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            bottom *= ZM
 								            for pn in pns[1:]:
 								                bottom += self.page_images[pn - 1].size[1]
 								            imgs.append(
 								                self.page_images[pns[0]].crop((left * ZM, top * ZM,
 								                                               right *
 								                                               ZM, min(
-												add paper & manual parser (#46)


											
										
										
											2024-01-30 18:28:09 +08:00
+								                    bottom, self.page_images[pns[0]].size[1])
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                                               ))
 								            )
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								            if 0 < ii < len(poss)-1:
 								                positions.append((pns[0]+self.page_from, left, right, top, min(
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								                    bottom, self.page_images[pns[0]].size[1])/ZM))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            bottom -= self.page_images[pns[0]].size[1]
 								            for pn in pns[1:]:
 								                imgs.append(
 								                    self.page_images[pn].crop((left * ZM, 0,
 								                                               right * ZM,
 								                                               min(bottom,
 								                                                   self.page_images[pn].size[1])
 								                                               ))
 								                )
-												fix position extraction bug (#93)

* fix position extraction bug

* remove delimiter for naive parser
											
										
										
											2024-03-04 17:08:35 +08:00
+								                if 0 < ii < len(poss) - 1:
 								                    positions.append((pn+self.page_from, left, right, 0, min(
 								                        bottom, self.page_images[pn].size[1]) / ZM))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                bottom -= self.page_images[pn].size[1]
 								        if not imgs:
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
+								            if need_position: return None, None
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            return
 								        height = 0
 								        for img in imgs:
 								            height += img.size[1] + GAP
 								        height = int(height)
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								        width = int(np.max([i.size[0] for i in imgs]))
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        pic = Image.new("RGB",
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								                        (width, height),
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								                        (245, 245, 245))
 								        height = 0
-												solve task execution issues (#90)


											
										
										
											2024-03-01 19:48:01 +08:00
+								        for ii, img in enumerate(imgs):
 								            if ii == 0 or ii + 1 == len(imgs):
 								                img = img.convert('RGBA')
 								                overlay = Image.new('RGBA', img.size, (0, 0, 0, 0))
 								                overlay.putalpha(128)
 								                img = Image.alpha_composite(img, overlay).convert("RGB")
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								            pic.paste(img, (0, int(height)))
 								            height += img.size[1] + GAP
-												fix table desc bugs, add positions to chunks (#91)


											
										
										
											2024-03-04 14:42:26 +08:00
 								        if need_position:
 								            return pic, positions
-												init python part (#7)


											
										
										
											2023-12-14 19:19:03 +08:00
+								        return pic
if __name__ == "__main__":
    # Running this module directly does nothing: the guard body is only `pass`.
    pass