#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import re
from copy import deepcopy
from io import BytesIO
from nltk import word_tokenize
from openpyxl import load_workbook
from rag.nlp import is_english, random_choices
from rag.nlp import huqie
from deepdoc.parser import ExcelParser
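# Q&A chunker: turns Excel or TAB-separated csv/txt files of
# question-answer pairs into one chunk document per pair.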


class Excel(ExcelParser):
    def __call__(self, fnm, binary=None, callback=None):
        if not binary:
            wb = load_workbook(fnm)
        else:
            wb = load_workbook(BytesIO(binary))
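        # First pass: count the rows in every sheet so progress can be
        # reported as a fraction of the total.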
        total = 0
        for sheetname in wb.sheetnames:
            total += len(list(wb[sheetname].rows))

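        # Second pass: in each row, the first non-empty cell is the question
        # and the second non-empty cell is the answer; rows missing either
        # are recorded as failures.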
        res, fails = [], []
        for sheetname in wb.sheetnames:
            ws = wb[sheetname]
            rows = list(ws.rows)
            for i, r in enumerate(rows):
                q, a = "", ""
                for cell in r:
                    if not cell.value:
                        continue
                    if not q:
                        q = str(cell.value)
                    elif not a:
                        a = str(cell.value)
                    else:
                        break
                if q and a:
                    res.append((q, a))
                else:
                    fails.append(str(i + 1))
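                # Report progress periodically (every 999 pairs); extraction
                # maps to the first 0.6 of the overall progress bar.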
                if len(res) % 999 == 0:
                    callback(len(res) * 0.6 / total,
                             ("Extract Q&A: {}. ".format(len(res)) +
                              (f"{len(fails)} failures, line: %s..." %
                               (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
            f"{len(fails)} failures, line: %s..." % (",".join(fails[:3])) if fails else "")))
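        # Guess the document language from a random sample of questions;
        # rmPrefix strips any leading "Question:"/"问题:"-style markers first.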
        self.is_english = is_english(
            [rmPrefix(q) for q, _ in random_choices(res, k=30) if len(q) > 1])
        return res


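# Strip a leading question/answer marker such as "Q:", "Answer:", "问题:" or
# "回答:" (case-insensitive), together with the trailing tab/colon/space.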
def rmPrefix(txt):
    return re.sub(
        r"^(问题|答案|回答|user|assistant|Q|A|Question|Answer|问|答)[\t:: ]+", "", txt.strip(), flags=re.IGNORECASE)


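# Turn a (question, answer) pair into a chunk document: the stored content is
# the prefixed question and answer joined by a tab, and the retrieval tokens
# are built from the question only.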
def beAdoc(d, q, a, eng):
    qprefix = "Question: " if eng else "问题:"
    aprefix = "Answer: " if eng else "回答:"
    d["content_with_weight"] = "\t".join(
        [qprefix + rmPrefix(q), aprefix + rmPrefix(a)])
    d["content_ltks"] = huqie.qie(q)
    d["content_sm_ltks"] = huqie.qieqie(d["content_ltks"])
    return d


def chunk(filename, binary=None, lang="Chinese", callback=None, **kwargs):
    """
        Excel and csv(txt) format files are supported.
        If the file is in excel format, there should be two columns, question and answer, without headers.
        The question column must precede the answer column.
        Multiple sheets are fine as long as the columns are arranged this way.

        If it's in csv format, it should be UTF-8 encoded, with TAB as the delimiter between question and answer.

        All malformed lines will be ignored.
        Every pair of Q&A will be treated as a chunk.
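
        For example, a single csv/txt line (with <TAB> marking the tab character):
            What is the Apache License?<TAB>A permissive open-source software license.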
    """
    eng = lang.lower() == "english"
    res = []
    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie(re.sub(r"\.[a-zA-Z]+$", "", filename))
    }
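    # Dispatch on the file extension: .xls/.xlsx is handed to the Excel
    # parser, .txt/.csv is parsed as TAB-separated text, anything else
    # raises NotImplementedError.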
    if re.search(r"\.xlsx?$", filename, re.IGNORECASE):
 | 
						||
        callback(0.1, "Start to parse.")
 | 
						||
        excel_parser = Excel()
 | 
						||
        for q, a in excel_parser(filename, binary, callback):
 | 
						||
            res.append(beAdoc(deepcopy(doc), q, a, eng))
 | 
						||
        return res
 | 
						||
    elif re.search(r"\.(txt|csv)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        txt = ""
        if binary:
            txt = binary.decode("utf-8")
        else:
            with open(filename, "r", encoding="utf-8") as f:
                while True:
                    l = f.readline()
                    if not l:
                        break
                    txt += l
        lines = txt.split("\n")
        # is_english([rmPrefix(l) for l in lines[:100]])
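        # A valid line has exactly two TAB-separated fields, each longer than
        # one character: the question and the answer. Everything else is
        # skipped and counted as a failure.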
        fails = []
        for i, line in enumerate(lines):
            arr = [l for l in line.split("\t") if len(l) > 1]
            if len(arr) != 2:
                fails.append(str(i))
                continue
            res.append(beAdoc(deepcopy(doc), arr[0], arr[1], eng))
            if len(res) % 999 == 0:
                callback(len(res) * 0.6 / len(lines), ("Extract Q&A: {}. ".format(len(res)) + (
                    f"{len(fails)} failures, line: %s..." % (",".join(fails[:3])) if fails else "")))

        callback(0.6, ("Extract Q&A: {}. ".format(len(res)) + (
            f"{len(fails)} failures, line: %s..." % (",".join(fails[:3])) if fails else "")))

        return res

    raise NotImplementedError(
        "file type not supported yet(excel, csv, txt supported)")


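# Simple manual test: chunk the file passed on the command line with a no-op
# progress callback.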
if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass
    chunk(sys.argv[1], callback=dummy)