import copy
import json
import os
import re

import requests

from api.db.services.knowledgebase_service import KnowledgebaseService
from api.settings import stat_logger
from rag.nlp import huqie
from rag.settings import cron_logger
from rag.utils import rmSpace

forbidden_select_fields4resume = [
    "name_pinyin_kwd", "edu_first_fea_kwd", "degree_kwd", "sch_rank_kwd", "edu_fea_kwd"
]


def chunk(filename, binary=None, callback=None, **kwargs):
    """
    The supported file formats are pdf, docx and txt.
    To maximize effectiveness and have the resume parsed correctly,
    please visit https://github.com/infiniflow/ragflow and sign in to our demo website
    to get a token. It's FREE!
    Set INFINIFLOW_SERVER and INFINIFLOW_TOKEN in the '.env' file, or
    use 'export' to set both environment variables (INFINIFLOW_SERVER and INFINIFLOW_TOKEN) in the docker container.
    """
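    # Example configuration (placeholder values, not a real endpoint or token):
    #   export INFINIFLOW_SERVER=<server-url>
    #   export INFINIFLOW_TOKEN=<your-token>
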
    if not re.search(r"\.(pdf|doc|docx|txt)$", filename, flags=re.IGNORECASE):
        raise NotImplementedError(
            "file type not supported yet (pdf, doc, docx and txt supported)")

    url = os.environ.get("INFINIFLOW_SERVER")
    token = os.environ.get("INFINIFLOW_TOKEN")

    if not url or not token:
        stat_logger.warning(
            "INFINIFLOW_SERVER is not specified. To maximize effectiveness, please visit "
            "https://github.com/infiniflow/ragflow and sign in to our demo website to get a token. "
            "It's FREE! Use 'export' to set both environment variables: INFINIFLOW_SERVER and INFINIFLOW_TOKEN.")
        return []

    if not binary:
        with open(filename, "rb") as f:
            binary = f.read()
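
    # remote_call posts the resume to the INFINIFLOW layout service and retries
    # up to three times on transport errors; a non-zero retcode from the server
    # is surfaced as a RuntimeError and not retried.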
    def remote_call():
        nonlocal filename, binary
        for _ in range(3):
            try:
                res = requests.post(url + "/v1/layout/resume/",
                                    files=[(filename, binary)],
                                    headers={"Authorization": token},
                                    timeout=180)
                res = res.json()
                if res["retcode"] != 0:
                    raise RuntimeError(res["retmsg"])
                return res["data"]
            except RuntimeError as e:
                raise e
            except Exception as e:
                cron_logger.error("resume parsing: " + str(e))
    callback(0.2, "Resume parsing is in progress...")

    resume = remote_call()
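
    # A successfully parsed resume is expected to contain at least seven
    # top-level fields; anything smaller is treated as a failed parse.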
    if not resume or len(resume.keys()) < 7:
        callback(-1, "Resume was not parsed successfully.")
        return []

    callback(0.6, "Done parsing. Chunking...")
    print(json.dumps(resume, ensure_ascii=False, indent=2))
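
    # field_map pairs index field names with Chinese display labels. Judging by the
    # naming convention, the suffix reflects the field type: _kwd keyword, _tks
    # tokenized text, _int/_flt numeric, _dt date.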
    field_map = {
        "name_kwd": "姓名/名字",
        "name_pinyin_kwd": "姓名拼音/名字拼音",
        "gender_kwd": "性别（男，女）",
        "age_int": "年龄/岁/年纪",
        "phone_kwd": "电话/手机/微信",
        "email_tks": "email/e-mail/邮箱",
        "position_name_tks": "职位/职能/岗位/职责",
        "expect_city_names_tks": "期望城市",
        "work_exp_flt": "工作年限/工作年份/N年经验/毕业了多少年",
        "corporation_name_tks": "最近就职（上班）的公司/上一家公司",

        "first_school_name_tks": "第一学历毕业学校",
        "first_degree_kwd": "第一学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
        "highest_degree_kwd": "最高学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
        "first_major_tks": "第一学历专业",
        "edu_first_fea_kwd": "第一学历标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

        "degree_kwd": "过往学历（高中，职高，硕士，本科，博士，初中，中技，中专，专科，专升本，MPA，MBA，EMBA）",
        "major_tks": "学过的专业/过往专业",
        "school_name_tks": "学校/毕业院校",
        "sch_rank_kwd": "学校标签（顶尖学校，精英学校，优质学校，一般学校）",
        "edu_fea_kwd": "教育标签（211，留学，双一流，985，海外知名，重点大学，中专，专升本，专科，本科，大专）",

        "corp_nm_tks": "就职过的公司/之前的公司/上过班的公司",
        "edu_end_int": "毕业年份",
        "industry_name_tks": "所在行业",
        "birth_dt": "生日/出生年份",
        "expect_position_name_tks": "期望职位/期望职能/期望岗位",
    }
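
    # Build a short document title from name, gender, position and age;
    # *_tks values come back tokenized (space separated), so rmSpace collapses them.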
    titles = []
    for n in ["name_kwd", "gender_kwd", "position_name_tks", "age_int"]:
        v = resume.get(n, "")
        if isinstance(v, list):
            v = v[0]
        if n.find("tks") > 0:
            v = rmSpace(v)
        titles.append(str(v))

    doc = {
        "docnm_kwd": filename,
        "title_tks": huqie.qie("-".join(titles) + "-简历")
    }
    doc["title_sm_tks"] = huqie.qieqie(doc["title_tks"])
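
    # Collect (Chinese label, value) pairs for every mapped field present in the parse result.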
    pairs = []
    for n, m in field_map.items():
        if not resume.get(n):
            continue
        v = resume[n]
        if isinstance(v, list):
            v = " ".join(v)
        if n.find("tks") > 0:
            v = rmSpace(v)
        pairs.append((m, str(v)))
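
    # Render the pairs as "label: value" lines, stripping the parenthesized
    # enumerations (full-width parentheses) from the labels.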
    doc["content_with_weight"] = "\n".join(
        ["{}: {}".format(re.sub(r"（[^（）]+）", "", k), v) for k, v in pairs])
    doc["content_ltks"] = huqie.qie(doc["content_with_weight"])
    doc["content_sm_ltks"] = huqie.qieqie(doc["content_ltks"])
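
    # Copy mapped fields onto the document: lists are collapsed to their first
    # element unless the field is in forbidden_select_fields4resume and has more
    # than one value; *_tks fields get fine-grained tokenization.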
    for n, _ in field_map.items():
        if n not in resume:
            continue
        if isinstance(resume[n], list) and (
                len(resume[n]) == 1 or n not in forbidden_select_fields4resume):
            resume[n] = resume[n][0]
        if n.find("_tks") > 0:
            resume[n] = huqie.qieqie(resume[n])
        doc[n] = resume[n]

    print(doc)
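
    # Persist the field map in the knowledge base's parser config so that other
    # components can refer to these resume fields later.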
    KnowledgebaseService.update_parser_config(
        kwargs["kb_id"], {"field_map": field_map})
    return [doc]


if __name__ == "__main__":
    import sys

    def dummy(a, b):
        pass

    chunk(sys.argv[1], callback=dummy)