mirror of
				https://github.com/infiniflow/ragflow.git
				synced 2025-10-26 15:29:09 +00:00 
			
		
		
		
	 cdea1d0a85
			
		
	
	
		cdea1d0a85
		
			
		
	
	
	
	
		
			
			### What problem does this PR solve? - Update readme - Add license ### Type of change - [x] Documentation Update --------- Signed-off-by: Jin Hai <haijin.chn@gmail.com>
		
			
				
	
	
		
			187 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
			
		
		
	
	
			187 lines
		
	
	
		
			6.2 KiB
		
	
	
	
		
			Python
		
	
	
	
	
	
| #  Licensed under the Apache License, Version 2.0 (the "License");
 | |
| #  you may not use this file except in compliance with the License.
 | |
| #  You may obtain a copy of the License at
 | |
| #
 | |
| #      http://www.apache.org/licenses/LICENSE-2.0
 | |
| #
 | |
| #  Unless required by applicable law or agreed to in writing, software
 | |
| #  distributed under the License is distributed on an "AS IS" BASIS,
 | |
| #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 | |
| #  See the License for the specific language governing permissions and
 | |
| #  limitations under the License.
 | |
| #
 | |
| 
 | |
| import json
 | |
| from deepdoc.parser.resume.entities import degrees, regions, industries
 | |
| 
 | |
# Output schema: each entry is "<column name> <type>".
# refactor() only uses the name part (text before the first space) and zips
# those names POSITIONALLY against its alphabetically sorted result columns,
# so the order below must track that sort. The "*_obj" entries sit where the
# un-suffixed column (e.g. "work", "education") falls in the sorted order,
# which is why "work_obj" precedes "work_experience".
# NOTE(review): the type names look like SQL/Hive column types - confirm
# against whatever consumes this list.
FIELDS = [
    "address STRING",
    "annual_salary int",
    "annual_salary_from int",
    "annual_salary_to int",
    "birth STRING",
    "card STRING",
    "certificate_obj string",
    "city STRING",
    "corporation_id int",
    "corporation_name STRING",
    "corporation_type STRING",
    "degree STRING",
    "discipline_name STRING",
    "education_obj string",
    "email STRING",
    "expect_annual_salary int",
    "expect_city_names string",
    "expect_industry_name STRING",
    "expect_position_name STRING",
    "expect_salary_from int",
    "expect_salary_to int",
    "expect_type STRING",
    "gender STRING",
    "industry_name STRING",
    "industry_names STRING",
    "is_deleted STRING",
    "is_fertility STRING",
    "is_house STRING",
    "is_management_experience STRING",
    "is_marital STRING",
    "is_oversea STRING",
    "language_obj string",
    "name STRING",
    "nation STRING",
    "phone STRING",
    "political_status STRING",
    "position_name STRING",
    "project_obj string",
    "responsibilities string",
    "salary_month int",
    "scale STRING",
    "school_name STRING",
    "self_remark string",
    "skill_obj string",
    "title_name STRING",
    "tob_resume_id STRING",
    "updated_at Timestamp",
    "wechat STRING",
    "work_obj string",
    "work_experience int",
    "work_start_time BIGINT",
]
 | |
| 
 | |
def refactor(df):
    """Flatten the JSON resumes in ``df["resume_content"]`` into one record.

    Each ``resume_content`` value is parsed as JSON. Section objects
    (education, work, ...) are re-serialised to JSON strings, scalar
    attributes are pulled out of the ``contact`` and ``basic`` sub-objects,
    a few coded values are translated to display strings, and every value is
    finally converted to a single-line string.

    Args:
        df: pandas DataFrame with a ``resume_content`` column holding JSON
            text. It is modified in place (an ``obj`` column and the
            extracted columns are added, NaNs are replaced by ``""``). A
            ``tob_resume_id`` column, if present, is carried through.

    Returns:
        dict mapping the column names from ``FIELDS`` to the string values
        of the FIRST row only.

    Raises:
        KeyError: if ``df`` has no ``resume_content`` column.
        IndexError: if ``df`` has no rows.
    """

    def deal_obj(obj, k, kk):
        # Two-level lookup obj[k][kk]; "" whenever a level is not a dict.
        if not isinstance(obj, dict):
            return ""
        obj = obj.get(k, {})
        if not isinstance(obj, dict):
            return ""
        return obj.get(kk, "")

    def loadjson(line):
        # Deliberately best effort: unparsable or non-string content (e.g.
        # NaN from pandas) yields an empty resume instead of aborting.
        try:
            return json.loads(line)
        except Exception:
            return {}

    def dump_section(x, c):
        # Serialise section ``c`` of the parsed resume ``x`` as JSON when it
        # is a dict or missing/empty.
        if isinstance(x, dict) and (
                isinstance(x.get(c), dict) or not x.get(c)):
            return json.dumps(x.get(c, {}), ensure_ascii=False)
        # NOTE(review): for any other shape (e.g. the section is a list) the
        # WHOLE parsed object is stringified, not just the section - kept
        # as-is, but confirm this is intended.
        return str(x).replace("None", "")

    df["obj"] = df["resume_content"].map(loadjson)
    df.fillna("", inplace=True)

    clms = ["tob_resume_id", "updated_at"]

    def extract(nms, cc=None):
        # Register the names as output columns and fill them either from
        # sub-object ``cc`` or, without ``cc``, as whole JSON sections.
        clms.extend(nms)
        for c in nms:
            if cc:
                df[c] = df["obj"].map(lambda x: deal_obj(x, cc, c))
            else:
                df[c] = df["obj"].map(lambda x: dump_section(x, c))

    extract(["education", "work", "certificate", "project", "language",
             "skill"])
    extract(["wechat", "phone", "is_deleted",
             "name", "tel", "email"], "contact")
    extract(["nation", "expect_industry_name", "salary_month",
             "industry_ids", "is_house", "birth", "annual_salary_from",
             "annual_salary_to", "card",
             "expect_salary_to", "expect_salary_from",
             "expect_position_name", "gender", "city",
             "is_fertility", "expect_city_names",
             "political_status", "title_name", "expect_annual_salary",
             "industry_name", "address", "position_name", "school_name",
             "corporation_id",
             "is_oversea", "responsibilities",
             "work_start_time", "degree", "management_experience",
             "expect_type", "corporation_type", "scale", "corporation_name",
             "self_remark", "annual_salary", "work_experience",
             "discipline_name", "marital", "updated_at"], "basic")

    # Translate coded values into display names via the entity tables.
    df["degree"] = df["degree"].map(lambda x: degrees.get_name(x))
    df["address"] = df["address"].map(
        lambda x: " ".join(regions.get_names(x)))
    df["industry_names"] = df["industry_ids"].map(
        lambda x: " ".join(
            [" ".join(industries.get_names(i)) for i in str(x).split(",")]))
    clms.append("industry_names")

    def arr2str(a):
        # Lists and comma-separated strings both become space-separated text.
        if not a:
            return ""
        if isinstance(a, list):
            a = " ".join([str(i) for i in a])
        return str(a).replace(",", " ")

    df["expect_industry_name"] = df["expect_industry_name"].map(arr2str)
    df["gender"] = df["gender"].map(
        lambda x: "男" if x == 'M' else (
            "女" if x == 'F' else ""))
    for c in ["is_fertility", "is_oversea", "is_house",
              "management_experience", "marital"]:
        df[c] = df[c].map(
            lambda x: '是' if x == 'Y' else (
                '否' if x == 'N' else ""))
    df["is_management_experience"] = df["management_experience"]
    df["is_marital"] = df["marital"]
    clms.extend(["is_management_experience", "is_marital"])

    df.fillna("", inplace=True)

    # Fall back to "tel" when "phone" is blank. Iterate values positionally
    # (not via df.loc[i, ...], which is label-based and breaks on any index
    # other than 0..n-1) and go through str() so numeric numbers coming from
    # the JSON do not raise on .strip().
    phones = []
    for phone, tel in zip(df["phone"], df["tel"]):
        tel = str(tel).strip()
        if not str(phone).strip() and tel:
            phone = tel
        phones.append(phone)
    df["phone"] = phones

    # Helper columns that were only needed to derive other columns are not
    # part of the output; the set also drops the duplicated "updated_at".
    helper_columns = {"industry_ids", "management_experience", "marital",
                      "tel"}
    clms = sorted(set(clms) - helper_columns)

    df = df.reindex(clms, axis=1)
    for c in clms:
        # Keep every value on one line: tabs become spaces, line breaks
        # become a literal backslash-n.
        df[c] = df[c].map(
            lambda s: str(s).replace("\t", " ")
                            .replace("\n", "\\n")
                            .replace("\r", "\\n"))

    # Positional pairing: the alphabetically sorted columns line up one to
    # one with the order of FIELDS (whose "*_obj" names stand for the
    # un-suffixed section columns). Only the first row is returned.
    return dict(zip([n.split(" ")[0] for n in FIELDS], df.values.tolist()[0]))
 |