ragflow/rag/app/book.py

#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import copy
from tika import parser
import re
from io import BytesIO

from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \
    hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \
    tokenize_chunks, find_codec
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser


class Pdf(PdfParser):
    """
    PDF front end used by the book chunker.

    Calling an instance runs the inherited PdfParser steps in a fixed order
    and reports progress through `callback` after the main stages.
    """

    def __call__(self, filename, binary=None, from_page=0,
                 to_page=100000, zoomin=3, callback=None):
        """
        Parse one PDF and return its text boxes together with its tables.

        :param filename: path of the PDF; used as the input when `binary` is falsy.
        :param binary: alternative input that takes precedence over `filename`
            when truthy (presumably the raw file content -- confirm against
            PdfParser.__images__).
        :param from_page: forwarded to `__images__` as the start of the page range.
        :param to_page: forwarded to `__images__` as the end of the page range.
        :param zoomin: zoom factor forwarded to the image, layout, table and
            line-tag helpers.
        :param callback: progress reporter, invoked both as `callback(msg=...)`
            and as `callback(progress, message)`. It is called unconditionally,
            so despite the `None` default a callable must be supplied.
        :return: a pair `(sections, tbls)`; `sections` is a list of
            `(text + line tag, layoutno or "")` tuples, one per entry of
            `self.boxes`, and `tbls` is whatever `_extract_table_figure` returned.
        """
        callback(msg="OCR is running...")
        # Either the path or the in-memory content is handed over, never both.
        self.__images__(
            filename if not binary else binary,
            zoomin,
            from_page,
            to_page,
            callback)
        callback(msg="OCR finished")

        # Only the layout step is timed; the duration goes to stdout.
        from timeit import default_timer as timer
        start = timer()
        self._layouts_rec(zoomin)
        callback(0.67, "Layout analysis finished")
        print("layouts:", timer() - start)
        self._table_transformer_job(zoomin)
        callback(0.68, "Table analysis finished")
        self._text_merge()
        # NOTE(review): the meaning of the three boolean flags is defined by
        # PdfParser._extract_table_figure and is not visible from this file.
        tbls = self._extract_table_figure(True, zoomin, True, True)
        self._naive_vertical_merge()
        self._filter_forpages()
        self._merge_with_same_bullet()
        callback(0.75, "Text merging finished.")

        callback(0.8, "Text extraction finished")

        # Each box contributes its text with the line tag appended, plus its
        # layout number (empty string when the box has none).
        return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
                for b in self.boxes], tbls


def _plain_sections(lines):
    """
        Turn raw text lines into `(text, "")` sections, dropping empty lines,
        and run `remove_contents_table` over the result before returning it.
    """
    secs = [(line, "") for line in lines if line]
    remove_contents_table(secs, eng=is_english(
        random_choices([t for t, _ in secs], k=200)))
    return secs


def chunk(filename, binary=None, from_page=0, to_page=100000,
          lang="Chinese", callback=None, **kwargs):
    """
        Supported file formats are docx, pdf, txt, htm/html and doc.
        Since a book is long and not all the parts are useful, if it's a PDF,
        please set up the page ranges for every book in order to eliminate
        negative effects and save computing time.

        :param filename: name (or path) of the document; its extension selects
            the parser and it is stored as `docnm_kwd` in every result.
        :param binary: file content; when falsy the file is read from `filename`.
        :param from_page: start of the page range, forwarded to the docx and
            pdf parsers only.
        :param to_page: end of the page range, forwarded to the docx and pdf
            parsers only.
        :param lang: the text is treated as English only when this equals
            "english" (case-insensitive).
        :param callback: progress reporter called as `callback(progress, message)`;
            when omitted, progress reports are discarded.
        :param kwargs: optional settings read here:
            `parser_config["layout_recognize"]` (default True) selects `Pdf`
            over `PlainParser` for PDFs; `chunk_token_num` (default 256) and
            `delimer` are passed to `naive_merge`.
        :return: the list produced by `tokenize_table` for the tables, extended
            with the output of `tokenize_chunks` for the text chunks.
        :raises NotImplementedError: if the extension is not one of the
            supported formats.
    """
    if callback is None:
        # The signature advertises `callback=None`, yet every branch reports
        # progress; fall back to a no-op so the default is actually usable.
        def callback(prog=None, msg=""):
            pass

    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    pdf_parser = None
    sections, tbls = [], []
    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        doc_parser = DocxParser()
        # TODO: table of contents need to be removed
        sections, tbls = doc_parser(
            binary if binary else filename, from_page=from_page, to_page=to_page)
        remove_contents_table(sections, eng=is_english(
            random_choices([t for t, _ in sections], k=200)))
        # NOTE(review): re-wraps each docx table into the nested tuple shape
        # passed to tokenize_table below -- confirm the expected layout there.
        tbls = [((None, lns), None) for lns in tbls]
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        # The parser reports its own progress through `callback`.
        use_layout = kwargs.get("parser_config", {}).get("layout_recognize", True)
        pdf_parser = Pdf() if use_layout else PlainParser()
        sections, tbls = pdf_parser(filename if not binary else binary,
                                    from_page=from_page, to_page=to_page, callback=callback)

    elif re.search(r"\.txt$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            # One read instead of a readline loop that grew the string with +=.
            with open(filename, "r") as f:
                txt = f.read()
        sections = _plain_sections(txt.split("\n"))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = _plain_sections(HtmlParser()(filename, binary))
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if not binary:
            # Like the other branches, fall back to the file on disk when no
            # content was handed in (previously `filename` was ignored here).
            with open(filename, "rb") as f:
                binary = f.read()
        doc_parsed = parser.from_buffer(BytesIO(binary))
        # The parsed content may be None when no text could be extracted;
        # treat that as an empty document instead of failing on `.split`.
        content = doc_parsed['content'] or ""
        sections = _plain_sections(content.split('\n'))
        callback(0.8, "Finish parsing.")

    else:
        raise NotImplementedError(
            "file type not supported yet(doc, docx, pdf, txt, htm, html supported)")

    make_colon_as_title(sections)
    bull = bullets_category(
        list(random_choices([t for t, _ in sections], k=100)))
    if bull >= 0:
        # A non-negative category switches to hierarchical merging
        # (presumably: a bullet/heading style was recognised -- see rag.nlp).
        chunks = ["\n".join(ck)
                  for ck in hierarchical_merge(bull, sections, 5)]
    else:
        # NOTE(review): only a section with exactly one "@" keeps its tail as
        # the second tuple element; any other count keeps just the text before
        # the first "@". Behaviour left as-is -- confirm against the tag format.
        parts = [s.split("@") for s, _ in sections]
        sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '')
                    for pr in parts]
        # NOTE(review): the option key is spelled "delimer"; kept for
        # compatibility with existing callers.
        chunks = naive_merge(
            sections,
            kwargs.get("chunk_token_num", 256),
            kwargs.get("delimer", "\n。；！？"))

    # The language is taken from the caller's `lang`, not detected from the text.
    eng = lang.lower() == "english"

    res = tokenize_table(tbls, doc, eng)
    res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))

    return res


if __name__ == "__main__":
    # Manual smoke run: chunk the file named by the first command-line
    # argument, restricted to pages 1-10, discarding all progress reports.
    import sys

    def dummy(prog=None, msg=""):
        """Progress callback that ignores whatever it is given."""
        return None

    source_path = sys.argv[1]
    chunk(source_path, from_page=1, to_page=10, callback=dummy)
use onnx models, new deepdoc (#68) 2024-02-21 16:32:38 +08:00			`# Licensed under the Apache License, Version 2.0 (the "License");`
			`# you may not use this file except in compliance with the License.`
			`# You may obtain a copy of the License at`
			`#`
			`# http://www.apache.org/licenses/LICENSE-2.0`
			`#`
			`# Unless required by applicable law or agreed to in writing, software`
			`# distributed under the License is distributed on an "AS IS" BASIS,`
			`# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`# See the License for the specific language governing permissions and`
			`# limitations under the License.`
			`#`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`import copy`
Add `.doc` file parser. (#497) ### What problem does this PR solve? Add `.doc` file parser, using tika. ``` pip install tika ``` ``` from tika import parser from io import BytesIO def extract_text_from_doc_bytes(doc_bytes): file_like_object = BytesIO(doc_bytes) parsed = parser.from_buffer(file_like_object) return parsed["content"] ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chrysanthemum-boy <fannc@qq.com> 2024-04-23 15:31:43 +08:00			`from tika import parser`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`import re`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00			`from io import BytesIO`

init README of deepdoc, add picture processer. (#71) * init README of deepdoc, add picture processer. * add resume parsing 2024-02-23 18:28:12 +08:00			`from rag.nlp import bullets_category, is_english, tokenize, remove_contents_table, \`
Fit a lot of encodings for text file. (#458) ### What problem does this PR solve? #384 ### Type of change - [x] Performance Improvement 2024-04-19 18:02:53 +08:00			`hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, add_positions, \`
			`tokenize_chunks, find_codec`
refine code (#595) ### What problem does this PR solve? ### Type of change - [x] Refactoring 2024-04-28 19:13:33 +08:00			`from rag.nlp import rag_tokenizer`
Add support for HTML file (#973) ### What problem does this PR solve? Add support for HTML file ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2024-05-30 09:12:55 +08:00			`from deepdoc.parser import PdfParser, DocxParser, PlainParser, HtmlParser`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00

use onnx models, new deepdoc (#68) 2024-02-21 16:32:38 +08:00			`class Pdf(PdfParser):`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`def __call__(self, filename, binary=None, from_page=0,`
			`to_page=100000, zoomin=3, callback=None):`
Update version info (#564) ### What problem does this PR solve? _Briefly describe what this PR aims to solve. Include background context that will help reviewers understand the purpose of the PR._ ### Type of change - [x] Documentation Update - [x] Refactoring --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com> 2024-04-26 20:07:26 +08:00			`callback(msg="OCR is running...")`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`self.__images__(`
			`filename if not binary else binary,`
			`zoomin,`
			`from_page,`
change callback strategy, add timezone to docker (#96) 2024-03-05 12:08:41 +08:00			`to_page,`
			`callback)`
fix task cancling bug (#98) 2024-03-05 16:33:47 +08:00			`callback(msg="OCR finished")`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00
			`from timeit import default_timer as timer`
			`start = timer()`
use onnx models, new deepdoc (#68) 2024-02-21 16:32:38 +08:00			`self._layouts_rec(zoomin)`
change callback strategy, add timezone to docker (#96) 2024-03-05 12:08:41 +08:00			`callback(0.67, "Layout analysis finished")`
enlarge docker memory usage (#501) ### What problem does this PR solve? ### Type of change - [x] Refactoring 2024-04-23 14:41:10 +08:00			`print("layouts:", timer() - start)`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`self._table_transformer_job(zoomin)`
			`callback(0.68, "Table analysis finished")`
			`self._text_merge()`
change callback strategy, add timezone to docker (#96) 2024-03-05 12:08:41 +08:00			`tbls = self._extract_table_figure(True, zoomin, True, True)`
			`self._naive_vertical_merge()`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`self._filter_forpages()`
			`self._merge_with_same_bullet()`
			`callback(0.75, "Text merging finished.")`

			`callback(0.8, "Text extraction finished")`

apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))`
			`for b in self.boxes], tbls`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00

apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`def chunk(filename, binary=None, from_page=0, to_page=100000,`
			`lang="Chinese", callback=None, **kwargs):`
Refine resume parts and fix bugs in retrival using sql (#66) 2024-02-19 19:22:17 +08:00			`"""`
			`Supported file formats are docx, pdf, txt.`
			`Since a book is long and not all the parts are useful, if it's a PDF,`
			`please setup the page ranges for every book in order eliminate negative effects and save elapsed computing time.`
			`"""`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`doc = {`
			`"docnm_kwd": filename,`
refine code (#595) ### What problem does this PR solve? ### Type of change - [x] Refactoring 2024-04-28 19:13:33 +08:00			`"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`}`
refine code (#595) ### What problem does this PR solve? ### Type of change - [x] Refactoring 2024-04-28 19:13:33 +08:00			`doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`pdf_parser = None`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`sections, tbls = [], []`
remove doc from supported processing types (#488) ### What problem does this PR solve? #474 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-04-22 15:46:09 +08:00			`if re.search(r"\.docx$", filename, re.IGNORECASE):`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`callback(0.1, "Start to parse.")`
use onnx models, new deepdoc (#68) 2024-02-21 16:32:38 +08:00			`doc_parser = DocxParser()`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`# TODO: table of contents need to be removed`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`sections, tbls = doc_parser(`
			`binary if binary else filename, from_page=from_page, to_page=to_page)`
			`remove_contents_table(sections, eng=is_english(`
			`random_choices([t for t, _ in sections], k=200)))`
fix bug of table in docx (#510) ### What problem does this PR solve? #509 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-04-23 19:10:33 +08:00			`tbls = [((None, lns), None) for lns in tbls]`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`callback(0.8, "Finish parsing.")`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`elif re.search(r"\.pdf$", filename, re.IGNORECASE):`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`pdf_parser = Pdf() if kwargs.get(`
			`"parser_config", {}).get(`
			`"layout_recognize", True) else PlainParser()`
fix table desc bugs, add positions to chunks (#91) 2024-03-04 14:42:26 +08:00			`sections, tbls = pdf_parser(filename if not binary else binary,`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`from_page=from_page, to_page=to_page, callback=callback)`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`elif re.search(r"\.txt$", filename, re.IGNORECASE):`
			`callback(0.1, "Start to parse.")`
			`txt = ""`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if binary:`
Fit a lot of encodings for text file. (#458) ### What problem does this PR solve? #384 ### Type of change - [x] Performance Improvement 2024-04-19 18:02:53 +08:00			`encoding = find_codec(binary)`
refine text decode (#657) ### What problem does this PR solve? #651 ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-05-07 12:25:47 +08:00			`txt = binary.decode(encoding, errors="ignore")`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`else:`
			`with open(filename, "r") as f:`
			`while True:`
			`l = f.readline()`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`if not l:`
			`break`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`txt += l`
remove unused codes, seperate layout detection out as a new api. Add new rag methed 'table' (#55) 2024-02-05 18:08:17 +08:00			`sections = txt.split("\n")`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`sections = [(l, "") for l in sections if l]`
			`remove_contents_table(sections, eng=is_english(`
			`random_choices([t for t, _ in sections], k=200)))`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00			`callback(0.8, "Finish parsing.")`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00
Add support for HTML file (#973) ### What problem does this PR solve? Add support for HTML file ### Type of change - [x] New Feature (non-breaking change which adds functionality) 2024-05-30 09:12:55 +08:00			`elif re.search(r"\.(htm\|html)$", filename, re.IGNORECASE):`
			`callback(0.1, "Start to parse.")`
			`sections = HtmlParser()(filename, binary)`
			`sections = [(l, "") for l in sections if l]`
			`remove_contents_table(sections, eng=is_english(`
			`random_choices([t for t, _ in sections], k=200)))`
			`callback(0.8, "Finish parsing.")`

Add `.doc` file parser. (#497) ### What problem does this PR solve? Add `.doc` file parser, using tika. ``` pip install tika ``` ``` from tika import parser from io import BytesIO def extract_text_from_doc_bytes(doc_bytes): file_like_object = BytesIO(doc_bytes) parsed = parser.from_buffer(file_like_object) return parsed["content"] ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chrysanthemum-boy <fannc@qq.com> 2024-04-23 15:31:43 +08:00			`elif re.search(r"\.doc$", filename, re.IGNORECASE):`
			`callback(0.1, "Start to parse.")`
			`binary = BytesIO(binary)`
			`doc_parsed = parser.from_buffer(binary)`
			`sections = doc_parsed['content'].split('\n')`
			`sections = [(l, "") for l in sections if l]`
			`remove_contents_table(sections, eng=is_english(`
			`random_choices([t for t, _ in sections], k=200)))`
			`callback(0.8, "Finish parsing.")`

apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`else:`
			`raise NotImplementedError(`
Add `.doc` file parser. (#497) ### What problem does this PR solve? Add `.doc` file parser, using tika. ``` pip install tika ``` ``` from tika import parser from io import BytesIO def extract_text_from_doc_bytes(doc_bytes): file_like_object = BytesIO(doc_bytes) parsed = parser.from_buffer(file_like_object) return parsed["content"] ``` ### Type of change - [x] New Feature (non-breaking change which adds functionality) --------- Co-authored-by: chrysanthemum-boy <fannc@qq.com> 2024-04-23 15:31:43 +08:00			`"file type not supported yet(doc, docx, pdf, txt supported)")`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00
Some document API refined. (#53) Add naive chunking method to RAG 2024-02-02 19:21:37 +08:00			`make_colon_as_title(sections)`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`bull = bullets_category(`
			`[t for t in random_choices([t for t, _ in sections], k=100)])`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00			`if bull >= 0:`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`chunks = ["\n".join(ck)`
refine log format (#312) ### What problem does this PR solve? Issue link:#264 ### Type of change - [x] Documentation Update - [x] Refactoring 2024-04-11 10:13:43 +08:00			`for ck in hierarchical_merge(bull, sections, 5)]`
solve task execution issues (#90) 2024-03-01 19:48:01 +08:00			`else:`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`sections = [s.split("@") for s, _ in sections]`
fix create dialog bug (#982) ### What problem does this PR solve? ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) 2024-05-30 09:25:05 +08:00			`sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections ]`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`chunks = naive_merge(`
			`sections, kwargs.get(`
			`"chunk_token_num", 256), kwargs.get(`
			`"delimer", "\n。；！？"))`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00
			`# is it English`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00			`# is_english(random_choices([t for t, _ in sections], k=218))`
			`eng = lang.lower() == "english"`
solve task execution issues (#90) 2024-03-01 19:48:01 +08:00
			`res = tokenize_table(tbls, doc, eng)`
add use layout or not option (#145) * add use layout or not option * trival 2024-03-22 19:21:09 +08:00			`res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))`
Add Q&A and Book, fix task running bugs (#50) 2024-02-01 18:53:56 +08:00
			`return res`


			`if __name__ == "__main__":`
			`import sys`
apply pep8 formalize (#155) 2024-03-27 11:33:46 +08:00
change callback strategy, add timezone to docker (#96) 2024-03-05 12:08:41 +08:00			`def dummy(prog=None, msg=""):`
Some document API refined. (#53) Add naive chunking method to RAG 2024-02-02 19:21:37 +08:00			`pass`
			`chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)`