| 
									
										
										
										
											2024-06-01 16:24:10 +08:00
										 |  |  |  | #  Licensed under the Apache License, Version 2.0 (the "License"); | 
					
						
							|  |  |  |  | #  you may not use this file except in compliance with the License. | 
					
						
							|  |  |  |  | #  You may obtain a copy of the License at | 
					
						
							|  |  |  |  | # | 
					
						
							|  |  |  |  | #      http://www.apache.org/licenses/LICENSE-2.0 | 
					
						
							|  |  |  |  | # | 
					
						
							|  |  |  |  | #  Unless required by applicable law or agreed to in writing, software | 
					
						
							|  |  |  |  | #  distributed under the License is distributed on an "AS IS" BASIS, | 
					
						
							|  |  |  |  | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
					
						
							|  |  |  |  | #  See the License for the specific language governing permissions and | 
					
						
							|  |  |  |  | #  limitations under the License. | 
					
						
							|  |  |  |  | # | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-16 12:29:58 +08:00
										 |  |  |  | import re, copy, time, datetime, demjson3, \ | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  |     traceback, signal | 
					
						
							|  |  |  |  | import numpy as np | 
					
						
							|  |  |  |  | from deepdoc.parser.resume.entities import degrees, schools, corporations | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  | from rag.nlp import rag_tokenizer, surname | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | from xpinyin import Pinyin | 
					
						
							|  |  |  |  | from contextlib import contextmanager | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | class TimeoutException(Exception): pass | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | @contextmanager | 
					
						
							|  |  |  |  | def time_limit(seconds): | 
					
						
							|  |  |  |  |     def signal_handler(signum, frame): | 
					
						
							|  |  |  |  |         raise TimeoutException("Timed out!") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     signal.signal(signal.SIGALRM, signal_handler) | 
					
						
							|  |  |  |  |     signal.alarm(seconds) | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         yield | 
					
						
							|  |  |  |  |     finally: | 
					
						
							|  |  |  |  |         signal.alarm(0) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
# Module-level placeholder initialised to None.
# NOTE(review): not read or assigned in the part of the file reviewed here — confirm it is still needed.
ENV = None
# Single xpinyin converter created once at import time.
# NOTE(review): not referenced in the part of the file reviewed here.
PY = Pinyin()
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def rmHtmlTag(line): | 
					
						
							|  |  |  |  |     return re.sub(r"<[a-z0-9.\"=';,:\+_/ -]+>", " ", line, 100000, re.IGNORECASE) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def highest_degree(dg): | 
					
						
							|  |  |  |  |     if not dg: return "" | 
					
						
							|  |  |  |  |     if type(dg) == type(""): dg = [dg] | 
					
						
							|  |  |  |  |     m = {"初中": 0, "高中": 1, "中专": 2, "大专": 3, "专升本": 4, "本科": 5, "硕士": 6, "博士": 7, "博士后": 8} | 
					
						
							|  |  |  |  |     return sorted([(d, m.get(d, -1)) for d in dg], key=lambda x: x[1] * -1)[0][0] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def forEdu(cv): | 
					
						
							|  |  |  |  |     if not cv.get("education_obj"): | 
					
						
							|  |  |  |  |         cv["integerity_flt"] *= 0.8 | 
					
						
							|  |  |  |  |         return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     first_fea, fea, maj, fmaj, deg, fdeg, sch, fsch, st_dt, ed_dt = [], [], [], [], [], [], [], [], [], [] | 
					
						
							|  |  |  |  |     edu_nst = [] | 
					
						
							|  |  |  |  |     edu_end_dt = "" | 
					
						
							|  |  |  |  |     cv["school_rank_int"] = 1000000 | 
					
						
							|  |  |  |  |     for ii, n in enumerate(sorted(cv["education_obj"], key=lambda x: x.get("start_time", "3"))): | 
					
						
							|  |  |  |  |         e = {} | 
					
						
							|  |  |  |  |         if n.get("end_time"): | 
					
						
							|  |  |  |  |             if n["end_time"] > edu_end_dt: edu_end_dt = n["end_time"] | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 dt = n["end_time"] | 
					
						
							|  |  |  |  |                 if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt) | 
					
						
							|  |  |  |  |                 y, m, d = getYMD(dt) | 
					
						
							|  |  |  |  |                 ed_dt.append(str(y)) | 
					
						
							|  |  |  |  |                 e["end_dt_kwd"] = str(y) | 
					
						
							|  |  |  |  |             except Exception as e: | 
					
						
							|  |  |  |  |                 pass | 
					
						
							|  |  |  |  |         if n.get("start_time"): | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 dt = n["start_time"] | 
					
						
							|  |  |  |  |                 if re.match(r"[0-9]{9,}", dt): dt = turnTm2Dt(dt) | 
					
						
							|  |  |  |  |                 y, m, d = getYMD(dt) | 
					
						
							|  |  |  |  |                 st_dt.append(str(y)) | 
					
						
							|  |  |  |  |                 e["start_dt_kwd"] = str(y) | 
					
						
							|  |  |  |  |             except Exception as e: | 
					
						
							|  |  |  |  |                 pass | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         r = schools.select(n.get("school_name", "")) | 
					
						
							|  |  |  |  |         if r: | 
					
						
							|  |  |  |  |             if str(r.get("type", "")) == "1": fea.append("211") | 
					
						
							|  |  |  |  |             if str(r.get("type", "")) == "2": fea.append("211") | 
					
						
							|  |  |  |  |             if str(r.get("is_abroad", "")) == "1": fea.append("留学") | 
					
						
							|  |  |  |  |             if str(r.get("is_double_first", "")) == "1": fea.append("双一流") | 
					
						
							|  |  |  |  |             if str(r.get("is_985", "")) == "1": fea.append("985") | 
					
						
							|  |  |  |  |             if str(r.get("is_world_known", "")) == "1": fea.append("海外知名") | 
					
						
							|  |  |  |  |             if r.get("rank") and cv["school_rank_int"] > r["rank"]: cv["school_rank_int"] = r["rank"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if n.get("school_name") and isinstance(n["school_name"], str): | 
					
						
							|  |  |  |  |             sch.append(re.sub(r"(211|985|重点大学|[,&;;-])", "", n["school_name"])) | 
					
						
							|  |  |  |  |             e["sch_nm_kwd"] = sch[-1] | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         fea.append(rag_tokenizer.fine_grained_tokenize(rag_tokenizer.tokenize(n.get("school_name", ""))).split(" ")[-1]) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |         if n.get("discipline_name") and isinstance(n["discipline_name"], str): | 
					
						
							|  |  |  |  |             maj.append(n["discipline_name"]) | 
					
						
							|  |  |  |  |             e["major_kwd"] = n["discipline_name"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if not n.get("degree") and "985" in fea and not first_fea: n["degree"] = "1" | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if n.get("degree"): | 
					
						
							|  |  |  |  |             d = degrees.get_name(n["degree"]) | 
					
						
							|  |  |  |  |             if d: e["degree_kwd"] = d | 
					
						
							|  |  |  |  |             if d == "本科" and ("专科" in deg or "专升本" in deg or "中专" in deg or "大专" in deg or re.search(r"(成人|自考|自学考试)", | 
					
						
							|  |  |  |  |                                                                                                      n.get( | 
					
						
							|  |  |  |  |                                                                                                          "school_name", | 
					
						
							|  |  |  |  |                                                                                                          ""))): d = "专升本" | 
					
						
							|  |  |  |  |             if d: deg.append(d) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |             # for first degree | 
					
						
							|  |  |  |  |             if not fdeg and d in ["中专", "专升本", "专科", "本科", "大专"]: | 
					
						
							|  |  |  |  |                 fdeg = [d] | 
					
						
							|  |  |  |  |                 if n.get("school_name"): fsch = [n["school_name"]] | 
					
						
							|  |  |  |  |                 if n.get("discipline_name"): fmaj = [n["discipline_name"]] | 
					
						
							|  |  |  |  |                 first_fea = copy.deepcopy(fea) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         edu_nst.append(e) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["sch_rank_kwd"] = [] | 
					
						
							|  |  |  |  |     if cv["school_rank_int"] <= 20 \ | 
					
						
							|  |  |  |  |             or ("海外名校" in fea and cv["school_rank_int"] <= 200): | 
					
						
							|  |  |  |  |         cv["sch_rank_kwd"].append("顶尖学校") | 
					
						
							|  |  |  |  |     elif cv["school_rank_int"] <= 50 and cv["school_rank_int"] > 20 \ | 
					
						
							|  |  |  |  |             or ("海外名校" in fea and cv["school_rank_int"] <= 500 and \ | 
					
						
							|  |  |  |  |                 cv["school_rank_int"] > 200): | 
					
						
							|  |  |  |  |         cv["sch_rank_kwd"].append("精英学校") | 
					
						
							|  |  |  |  |     elif cv["school_rank_int"] > 50 and ("985" in fea or "211" in fea) \ | 
					
						
							|  |  |  |  |             or ("海外名校" in fea and cv["school_rank_int"] > 500): | 
					
						
							|  |  |  |  |         cv["sch_rank_kwd"].append("优质学校") | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         cv["sch_rank_kwd"].append("一般学校") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if edu_nst: cv["edu_nst"] = edu_nst | 
					
						
							|  |  |  |  |     if fea: cv["edu_fea_kwd"] = list(set(fea)) | 
					
						
							|  |  |  |  |     if first_fea: cv["edu_first_fea_kwd"] = list(set(first_fea)) | 
					
						
							|  |  |  |  |     if maj: cv["major_kwd"] = maj | 
					
						
							|  |  |  |  |     if fsch: cv["first_school_name_kwd"] = fsch | 
					
						
							|  |  |  |  |     if fdeg: cv["first_degree_kwd"] = fdeg | 
					
						
							|  |  |  |  |     if fmaj: cv["first_major_kwd"] = fmaj | 
					
						
							|  |  |  |  |     if st_dt: cv["edu_start_kwd"] = st_dt | 
					
						
							|  |  |  |  |     if ed_dt: cv["edu_end_kwd"] = ed_dt | 
					
						
							|  |  |  |  |     if ed_dt: cv["edu_end_int"] = max([int(t) for t in ed_dt]) | 
					
						
							|  |  |  |  |     if deg: | 
					
						
							|  |  |  |  |         if "本科" in deg and "专科" in deg: | 
					
						
							|  |  |  |  |             deg.append("专升本") | 
					
						
							|  |  |  |  |             deg = [d for d in deg if d != '本科'] | 
					
						
							|  |  |  |  |         cv["degree_kwd"] = deg | 
					
						
							|  |  |  |  |         cv["highest_degree_kwd"] = highest_degree(deg) | 
					
						
							|  |  |  |  |     if edu_end_dt: | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             if re.match(r"[0-9]{9,}", edu_end_dt): edu_end_dt = turnTm2Dt(edu_end_dt) | 
					
						
							|  |  |  |  |             if edu_end_dt.strip("\n") == "至今": edu_end_dt = cv.get("updated_at_dt", str(datetime.date.today())) | 
					
						
							|  |  |  |  |             y, m, d = getYMD(edu_end_dt) | 
					
						
							|  |  |  |  |             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000)) | 
					
						
							|  |  |  |  |         except Exception as e: | 
					
						
							|  |  |  |  |             print("EXCEPTION: ", e, edu_end_dt, cv.get("work_exp_flt")) | 
					
						
							|  |  |  |  |     if sch: | 
					
						
							|  |  |  |  |         cv["school_name_kwd"] = sch | 
					
						
							|  |  |  |  |         if (len(cv.get("degree_kwd", [])) >= 1 and "本科" in cv["degree_kwd"]) \ | 
					
						
							|  |  |  |  |                 or all([c.lower() in ["硕士", "博士", "mba", "博士后"] for c in cv.get("degree_kwd", [])]) \ | 
					
						
							|  |  |  |  |                 or not cv.get("degree_kwd"): | 
					
						
							|  |  |  |  |             for c in sch: | 
					
						
							|  |  |  |  |                 if schools.is_good(c): | 
					
						
							|  |  |  |  |                     if "tag_kwd" not in cv: cv["tag_kwd"] = [] | 
					
						
							|  |  |  |  |                     cv["tag_kwd"].append("好学校") | 
					
						
							|  |  |  |  |                     cv["tag_kwd"].append("好学历") | 
					
						
							|  |  |  |  |                     break | 
					
						
							|  |  |  |  |         if (len(cv.get("degree_kwd", [])) >= 1 and \ | 
					
						
							|  |  |  |  |             "本科" in cv["degree_kwd"] and \ | 
					
						
							|  |  |  |  |             any([d.lower() in ["硕士", "博士", "mba", "博士"] for d in cv.get("degree_kwd", [])])) \ | 
					
						
							|  |  |  |  |                 or all([d.lower() in ["硕士", "博士", "mba", "博士后"] for d in cv.get("degree_kwd", [])]) \ | 
					
						
							|  |  |  |  |                 or any([d in ["mba", "emba", "博士后"] for d in cv.get("degree_kwd", [])]): | 
					
						
							|  |  |  |  |             if "tag_kwd" not in cv: cv["tag_kwd"] = [] | 
					
						
							|  |  |  |  |             if "好学历" not in cv["tag_kwd"]: cv["tag_kwd"].append("好学历") | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |     if cv.get("major_kwd"): cv["major_tks"] = rag_tokenizer.tokenize(" ".join(maj)) | 
					
						
							|  |  |  |  |     if cv.get("school_name_kwd"): cv["school_name_tks"] = rag_tokenizer.tokenize(" ".join(sch)) | 
					
						
							|  |  |  |  |     if cv.get("first_school_name_kwd"): cv["first_school_name_tks"] = rag_tokenizer.tokenize(" ".join(fsch)) | 
					
						
							|  |  |  |  |     if cv.get("first_major_kwd"): cv["first_major_tks"] = rag_tokenizer.tokenize(" ".join(fmaj)) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def forProj(cv): | 
					
						
							|  |  |  |  |     if not cv.get("project_obj"): return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     pro_nms, desc = [], [] | 
					
						
							|  |  |  |  |     for i, n in enumerate( | 
					
						
							|  |  |  |  |             sorted(cv.get("project_obj", []), key=lambda x: str(x.get("updated_at", "")) if type(x) == type({}) else "", | 
					
						
							|  |  |  |  |                    reverse=True)): | 
					
						
							|  |  |  |  |         if n.get("name"): pro_nms.append(n["name"]) | 
					
						
							|  |  |  |  |         if n.get("describe"): desc.append(str(n["describe"])) | 
					
						
							|  |  |  |  |         if n.get("responsibilities"): desc.append(str(n["responsibilities"])) | 
					
						
							|  |  |  |  |         if n.get("achivement"): desc.append(str(n["achivement"])) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if pro_nms: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         # cv["pro_nms_tks"] = rag_tokenizer.tokenize(" ".join(pro_nms)) | 
					
						
							|  |  |  |  |         cv["project_name_tks"] = rag_tokenizer.tokenize(pro_nms[0]) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  |     if desc: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         cv["pro_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(" ".join(desc))) | 
					
						
							|  |  |  |  |         cv["project_desc_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(desc[0])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def json_loads(line): | 
					
						
							| 
									
										
										
										
											2024-04-16 12:29:58 +08:00
										 |  |  |  |     return demjson3.decode(re.sub(r": *(True|False)", r": '\1'", line)) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def forWork(cv): | 
					
						
							|  |  |  |  |     if not cv.get("work_obj"): | 
					
						
							|  |  |  |  |         cv["integerity_flt"] *= 0.7 | 
					
						
							|  |  |  |  |         return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     flds = ["position_name", "corporation_name", "corporation_id", "responsibilities", | 
					
						
							|  |  |  |  |             "industry_name", "subordinates_count"] | 
					
						
							|  |  |  |  |     duas = [] | 
					
						
							|  |  |  |  |     scales = [] | 
					
						
							|  |  |  |  |     fea = {c: [] for c in flds} | 
					
						
							|  |  |  |  |     latest_job_tm = "" | 
					
						
							|  |  |  |  |     goodcorp = False | 
					
						
							|  |  |  |  |     goodcorp_ = False | 
					
						
							|  |  |  |  |     work_st_tm = "" | 
					
						
							|  |  |  |  |     corp_tags = [] | 
					
						
							|  |  |  |  |     for i, n in enumerate( | 
					
						
							|  |  |  |  |             sorted(cv.get("work_obj", []), key=lambda x: str(x.get("start_time", "")) if type(x) == type({}) else "", | 
					
						
							|  |  |  |  |                    reverse=True)): | 
					
						
							|  |  |  |  |         if type(n) == type(""): | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 n = json_loads(n) | 
					
						
							|  |  |  |  |             except Exception as e: | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if n.get("start_time") and (not work_st_tm or n["start_time"] < work_st_tm): work_st_tm = n["start_time"] | 
					
						
							|  |  |  |  |         for c in flds: | 
					
						
							|  |  |  |  |             if not n.get(c) or str(n[c]) == '0': | 
					
						
							|  |  |  |  |                 fea[c].append("") | 
					
						
							|  |  |  |  |                 continue | 
					
						
							|  |  |  |  |             if c == "corporation_name": | 
					
						
							|  |  |  |  |                 n[c] = corporations.corpNorm(n[c], False) | 
					
						
							|  |  |  |  |                 if corporations.is_good(n[c]): | 
					
						
							|  |  |  |  |                     if i == 0: | 
					
						
							|  |  |  |  |                         goodcorp = True | 
					
						
							|  |  |  |  |                     else: | 
					
						
							|  |  |  |  |                         goodcorp_ = True | 
					
						
							|  |  |  |  |                 ct = corporations.corp_tag(n[c]) | 
					
						
							|  |  |  |  |                 if i == 0: | 
					
						
							|  |  |  |  |                     corp_tags.extend(ct) | 
					
						
							|  |  |  |  |                 elif ct and ct[0] != "软外": | 
					
						
							|  |  |  |  |                     corp_tags.extend([f"{t}(曾)" for t in ct]) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |             fea[c].append(rmHtmlTag(str(n[c]).lower())) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         y, m, d = getYMD(n.get("start_time")) | 
					
						
							|  |  |  |  |         if not y or not m: continue | 
					
						
							|  |  |  |  |         st = "%s-%02d-%02d" % (y, int(m), int(d)) | 
					
						
							|  |  |  |  |         latest_job_tm = st | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         y, m, d = getYMD(n.get("end_time")) | 
					
						
							|  |  |  |  |         if (not y or not m) and i > 0: continue | 
					
						
							|  |  |  |  |         if not y or not m or int(y) > 2022:  y, m, d = getYMD(str(n.get("updated_at", ""))) | 
					
						
							|  |  |  |  |         if not y or not m: continue | 
					
						
							|  |  |  |  |         ed = "%s-%02d-%02d" % (y, int(m), int(d)) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             duas.append((datetime.datetime.strptime(ed, "%Y-%m-%d") - datetime.datetime.strptime(st, "%Y-%m-%d")).days) | 
					
						
							|  |  |  |  |         except Exception as e: | 
					
						
							|  |  |  |  |             print("kkkkkkkkkkkkkkkkkkkk", n.get("start_time"), n.get("end_time")) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if n.get("scale"): | 
					
						
							|  |  |  |  |             r = re.search(r"^([0-9]+)", str(n["scale"])) | 
					
						
							|  |  |  |  |             if r: scales.append(int(r.group(1))) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if goodcorp: | 
					
						
							|  |  |  |  |         if "tag_kwd" not in cv: cv["tag_kwd"] = [] | 
					
						
							|  |  |  |  |         cv["tag_kwd"].append("好公司") | 
					
						
							|  |  |  |  |     if goodcorp_: | 
					
						
							|  |  |  |  |         if "tag_kwd" not in cv: cv["tag_kwd"] = [] | 
					
						
							|  |  |  |  |         cv["tag_kwd"].append("好公司(曾)") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if corp_tags: | 
					
						
							|  |  |  |  |         if "tag_kwd" not in cv: cv["tag_kwd"] = [] | 
					
						
							|  |  |  |  |         cv["tag_kwd"].extend(corp_tags) | 
					
						
							|  |  |  |  |         cv["corp_tag_kwd"] = [c for c in corp_tags if re.match(r"(综合|行业)", c)] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if latest_job_tm: cv["latest_job_dt"] = latest_job_tm | 
					
						
							|  |  |  |  |     if fea["corporation_id"]: cv["corporation_id"] = fea["corporation_id"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea["position_name"]: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         cv["position_name_tks"] = rag_tokenizer.tokenize(fea["position_name"][0]) | 
					
						
							|  |  |  |  |         cv["position_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["position_name_tks"]) | 
					
						
							|  |  |  |  |         cv["pos_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["position_name"][1:])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea["industry_name"]: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         cv["industry_name_tks"] = rag_tokenizer.tokenize(fea["industry_name"][0]) | 
					
						
							|  |  |  |  |         cv["industry_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["industry_name_tks"]) | 
					
						
							|  |  |  |  |         cv["indu_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["industry_name"][1:])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea["corporation_name"]: | 
					
						
							|  |  |  |  |         cv["corporation_name_kwd"] = fea["corporation_name"][0] | 
					
						
							|  |  |  |  |         cv["corp_nm_kwd"] = fea["corporation_name"] | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         cv["corporation_name_tks"] = rag_tokenizer.tokenize(fea["corporation_name"][0]) | 
					
						
							|  |  |  |  |         cv["corporation_name_sm_tks"] = rag_tokenizer.fine_grained_tokenize(cv["corporation_name_tks"]) | 
					
						
							|  |  |  |  |         cv["corp_nm_tks"] = rag_tokenizer.tokenize(" ".join(fea["corporation_name"][1:])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea["responsibilities"]: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |         cv["responsibilities_ltks"] = rag_tokenizer.tokenize(fea["responsibilities"][0]) | 
					
						
							|  |  |  |  |         cv["resp_ltks"] = rag_tokenizer.tokenize(" ".join(fea["responsibilities"][1:])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea["subordinates_count"]: fea["subordinates_count"] = [int(i) for i in fea["subordinates_count"] if | 
					
						
							|  |  |  |  |                                                                re.match(r"[^0-9]+$", str(i))] | 
					
						
							|  |  |  |  |     if fea["subordinates_count"]: cv["max_sub_cnt_int"] = np.max(fea["subordinates_count"]) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if type(cv.get("corporation_id")) == type(1): cv["corporation_id"] = [str(cv["corporation_id"])] | 
					
						
							|  |  |  |  |     if not cv.get("corporation_id"): cv["corporation_id"] = [] | 
					
						
							|  |  |  |  |     for i in cv.get("corporation_id", []): | 
					
						
							|  |  |  |  |         cv["baike_flt"] = max(corporations.baike(i), cv["baike_flt"] if "baike_flt" in cv else 0) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if work_st_tm: | 
					
						
							|  |  |  |  |         try: | 
					
						
							|  |  |  |  |             if re.match(r"[0-9]{9,}", work_st_tm): work_st_tm = turnTm2Dt(work_st_tm) | 
					
						
							|  |  |  |  |             y, m, d = getYMD(work_st_tm) | 
					
						
							|  |  |  |  |             cv["work_exp_flt"] = min(int(str(datetime.date.today())[0:4]) - int(y), cv.get("work_exp_flt", 1000)) | 
					
						
							|  |  |  |  |         except Exception as e: | 
					
						
							|  |  |  |  |             print("EXCEPTION: ", e, work_st_tm, cv.get("work_exp_flt")) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["job_num_int"] = 0 | 
					
						
							|  |  |  |  |     if duas: | 
					
						
							|  |  |  |  |         cv["dua_flt"] = np.mean(duas) | 
					
						
							|  |  |  |  |         cv["cur_dua_int"] = duas[0] | 
					
						
							|  |  |  |  |         cv["job_num_int"] = len(duas) | 
					
						
							|  |  |  |  |     if scales: cv["scale_flt"] = np.max(scales) | 
					
						
							|  |  |  |  |     return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def turnTm2Dt(b): | 
					
						
							|  |  |  |  |     if not b: return | 
					
						
							|  |  |  |  |     b = str(b).strip() | 
					
						
							|  |  |  |  |     if re.match(r"[0-9]{10,}", b): b = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(int(b[:10]))) | 
					
						
							|  |  |  |  |     return b | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def getYMD(b): | 
					
						
							|  |  |  |  |     y, m, d = "", "", "01" | 
					
						
							|  |  |  |  |     if not b: return (y, m, d) | 
					
						
							|  |  |  |  |     b = turnTm2Dt(b) | 
					
						
							|  |  |  |  |     if re.match(r"[0-9]{4}", b): y = int(b[:4]) | 
					
						
							|  |  |  |  |     r = re.search(r"[0-9]{4}.?([0-9]{1,2})", b) | 
					
						
							|  |  |  |  |     if r: m = r.group(1) | 
					
						
							|  |  |  |  |     r = re.search(r"[0-9]{4}.?[0-9]{,2}.?([0-9]{1,2})", b) | 
					
						
							|  |  |  |  |     if r: d = r.group(1) | 
					
						
							|  |  |  |  |     if not d or int(d) == 0 or int(d) > 31: d = "1" | 
					
						
							|  |  |  |  |     if not m or int(m) > 12 or int(m) < 1: m = "1" | 
					
						
							|  |  |  |  |     return (y, m, d) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def birth(cv): | 
					
						
							|  |  |  |  |     if not cv.get("birth"): | 
					
						
							|  |  |  |  |         cv["integerity_flt"] *= 0.9 | 
					
						
							|  |  |  |  |         return cv | 
					
						
							|  |  |  |  |     y, m, d = getYMD(cv["birth"]) | 
					
						
							|  |  |  |  |     if not m or not y: return cv | 
					
						
							|  |  |  |  |     b = "%s-%02d-%02d" % (y, int(m), int(d)) | 
					
						
							|  |  |  |  |     cv["birth_dt"] = b | 
					
						
							|  |  |  |  |     cv["birthday_kwd"] = "%02d%02d" % (int(m), int(d)) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["age_int"] = datetime.datetime.now().year - int(y) | 
					
						
							|  |  |  |  |     return cv | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def parse(cv): | 
					
						
							|  |  |  |  |     for k in cv.keys(): | 
					
						
							|  |  |  |  |         if cv[k] == '\\N': cv[k] = '' | 
					
						
							|  |  |  |  |     # cv = cv.asDict() | 
					
						
							|  |  |  |  |     tks_fld = ["address", "corporation_name", "discipline_name", "email", "expect_city_names", | 
					
						
							|  |  |  |  |                "expect_industry_name", "expect_position_name", "industry_name", "industry_names", "name", | 
					
						
							|  |  |  |  |                "position_name", "school_name", "self_remark", "title_name"] | 
					
						
							|  |  |  |  |     small_tks_fld = ["corporation_name", "expect_position_name", "position_name", "school_name", "title_name"] | 
					
						
							|  |  |  |  |     kwd_fld = ["address", "city", "corporation_type", "degree", "discipline_name", "expect_city_names", "email", | 
					
						
							|  |  |  |  |                "expect_industry_name", "expect_position_name", "expect_type", "gender", "industry_name", | 
					
						
							|  |  |  |  |                "industry_names", "political_status", "position_name", "scale", "school_name", "phone", "tel"] | 
					
						
							|  |  |  |  |     num_fld = ["annual_salary", "annual_salary_from", "annual_salary_to", "expect_annual_salary", "expect_salary_from", | 
					
						
							|  |  |  |  |                "expect_salary_to", "salary_month"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     is_fld = [ | 
					
						
							|  |  |  |  |         ("is_fertility", "已育", "未育"), | 
					
						
							|  |  |  |  |         ("is_house", "有房", "没房"), | 
					
						
							|  |  |  |  |         ("is_management_experience", "有管理经验", "无管理经验"), | 
					
						
							|  |  |  |  |         ("is_marital", "已婚", "未婚"), | 
					
						
							|  |  |  |  |         ("is_oversea", "有海外经验", "无海外经验") | 
					
						
							|  |  |  |  |     ] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     rmkeys = [] | 
					
						
							|  |  |  |  |     for k in cv.keys(): | 
					
						
							|  |  |  |  |         if cv[k] is None: rmkeys.append(k) | 
					
						
							|  |  |  |  |         if (type(cv[k]) == type([]) or type(cv[k]) == type("")) and len(cv[k]) == 0: rmkeys.append(k) | 
					
						
							|  |  |  |  |     for k in rmkeys: del cv[k] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     integerity = 0. | 
					
						
							|  |  |  |  |     flds_num = 0. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     def hasValues(flds): | 
					
						
							|  |  |  |  |         nonlocal integerity, flds_num | 
					
						
							|  |  |  |  |         flds_num += len(flds) | 
					
						
							|  |  |  |  |         for f in flds: | 
					
						
							|  |  |  |  |             v = str(cv.get(f, "")) | 
					
						
							|  |  |  |  |             if len(v) > 0 and v != '0' and v != '[]': integerity += 1 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     hasValues(tks_fld) | 
					
						
							|  |  |  |  |     hasValues(small_tks_fld) | 
					
						
							|  |  |  |  |     hasValues(kwd_fld) | 
					
						
							|  |  |  |  |     hasValues(num_fld) | 
					
						
							|  |  |  |  |     cv["integerity_flt"] = integerity / flds_num | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if cv.get("corporation_type"): | 
					
						
							|  |  |  |  |         for p, r in [(r"(公司|企业|其它|其他|Others*|\n|未填写|Enterprises|Company|companies)", ""), | 
					
						
							|  |  |  |  |                      (r"[//.· <\((]+.*", ""), | 
					
						
							|  |  |  |  |                      (r".*(合资|民企|股份制|中外|私营|个体|Private|创业|Owned|投资).*", "民营"), | 
					
						
							|  |  |  |  |                      (r".*(机关|事业).*", "机关"), | 
					
						
							|  |  |  |  |                      (r".*(非盈利|Non-profit).*", "非盈利"), | 
					
						
							|  |  |  |  |                      (r".*(外企|外商|欧美|foreign|Institution|Australia|港资).*", "外企"), | 
					
						
							|  |  |  |  |                      (r".*国有.*", "国企"), | 
					
						
							|  |  |  |  |                      (r"[ ()\(\)人/·0-9-]+", ""), | 
					
						
							|  |  |  |  |                      (r".*(元|规模|于|=|北京|上海|至今|中国|工资|州|shanghai|强|餐饮|融资|职).*", "")]: | 
					
						
							|  |  |  |  |             cv["corporation_type"] = re.sub(p, r, cv["corporation_type"], 1000, re.IGNORECASE) | 
					
						
							|  |  |  |  |         if len(cv["corporation_type"]) < 2: del cv["corporation_type"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if cv.get("political_status"): | 
					
						
							|  |  |  |  |         for p, r in [ | 
					
						
							|  |  |  |  |             (r".*党员.*", "党员"), | 
					
						
							|  |  |  |  |             (r".*(无党派|公民).*", "群众"), | 
					
						
							|  |  |  |  |             (r".*团员.*", "团员")]: | 
					
						
							|  |  |  |  |             cv["political_status"] = re.sub(p, r, cv["political_status"]) | 
					
						
							|  |  |  |  |         if not re.search(r"[党团群]", cv["political_status"]): del cv["political_status"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if cv.get("phone"): cv["phone"] = re.sub(r"^0*86([0-9]{11})", r"\1", re.sub(r"[^0-9]+", "", cv["phone"])) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     keys = list(cv.keys()) | 
					
						
							|  |  |  |  |     for k in keys: | 
					
						
							|  |  |  |  |         # deal with json objects | 
					
						
							|  |  |  |  |         if k.find("_obj") > 0: | 
					
						
							|  |  |  |  |             try: | 
					
						
							|  |  |  |  |                 cv[k] = json_loads(cv[k]) | 
					
						
							|  |  |  |  |                 cv[k] = [a for _, a in cv[k].items()] | 
					
						
							|  |  |  |  |                 nms = [] | 
					
						
							|  |  |  |  |                 for n in cv[k]: | 
					
						
							|  |  |  |  |                     if type(n) != type({}) or "name" not in n or not n.get("name"): continue | 
					
						
							|  |  |  |  |                     n["name"] = re.sub(r"((442)|\t )", "", n["name"]).strip().lower() | 
					
						
							|  |  |  |  |                     if not n["name"]: continue | 
					
						
							|  |  |  |  |                     nms.append(n["name"]) | 
					
						
							|  |  |  |  |                 if nms: | 
					
						
							|  |  |  |  |                     t = k[:-4] | 
					
						
							|  |  |  |  |                     cv[f"{t}_kwd"] = nms | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |                     cv[f"{t}_tks"] = rag_tokenizer.tokenize(" ".join(nms)) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  |             except Exception as e: | 
					
						
							|  |  |  |  |                 print("【EXCEPTION】:", str(traceback.format_exc()), cv[k]) | 
					
						
							|  |  |  |  |                 cv[k] = [] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         # tokenize fields | 
					
						
							|  |  |  |  |         if k in tks_fld: | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |             cv[f"{k}_tks"] = rag_tokenizer.tokenize(cv[k]) | 
					
						
							|  |  |  |  |             if k in small_tks_fld: cv[f"{k}_sm_tks"] = rag_tokenizer.tokenize(cv[f"{k}_tks"]) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |         # keyword fields | 
					
						
							|  |  |  |  |         if k in kwd_fld: cv[f"{k}_kwd"] = [n.lower() | 
					
						
							|  |  |  |  |                                            for n in re.split(r"[\t,,;;. ]", | 
					
						
							|  |  |  |  |                                                              re.sub(r"([^a-zA-Z])[ ]+([^a-zA-Z ])", r"\1,\2", cv[k]) | 
					
						
							|  |  |  |  |                                                              ) if n] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         if k in num_fld and cv.get(k): cv[f"{k}_int"] = cv[k] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["email_kwd"] = cv.get("email_tks", "").replace(" ", "") | 
					
						
							|  |  |  |  |     # for name field | 
					
						
							|  |  |  |  |     if cv.get("name"): | 
					
						
							|  |  |  |  |         nm = re.sub(r"[\n——\-\((\+].*", "", cv["name"].strip()) | 
					
						
							|  |  |  |  |         nm = re.sub(r"[ \t ]+", " ", nm) | 
					
						
							|  |  |  |  |         if re.match(r"[a-zA-Z ]+$", nm): | 
					
						
							|  |  |  |  |             if len(nm.split(" ")) > 1: | 
					
						
							|  |  |  |  |                 cv["name"] = nm | 
					
						
							|  |  |  |  |             else: | 
					
						
							|  |  |  |  |                 nm = "" | 
					
						
							|  |  |  |  |         elif nm and (surname.isit(nm[0]) or surname.isit(nm[:2])): | 
					
						
							|  |  |  |  |             nm = re.sub(r"[a-zA-Z]+.*", "", nm[:5]) | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             nm = "" | 
					
						
							|  |  |  |  |         cv["name"] = nm.strip() | 
					
						
							|  |  |  |  |         name = cv["name"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         # name pingyin and its prefix | 
					
						
							|  |  |  |  |         cv["name_py_tks"] = " ".join(PY.get_pinyins(nm[:20], '')) + " " + " ".join(PY.get_pinyins(nm[:20], ' ')) | 
					
						
							|  |  |  |  |         cv["name_py_pref0_tks"] = "" | 
					
						
							|  |  |  |  |         cv["name_py_pref_tks"] = "" | 
					
						
							|  |  |  |  |         for py in PY.get_pinyins(nm[:20], ''): | 
					
						
							|  |  |  |  |             for i in range(2, len(py) + 1): cv["name_py_pref_tks"] += " " + py[:i] | 
					
						
							|  |  |  |  |         for py in PY.get_pinyins(nm[:20], ' '): | 
					
						
							|  |  |  |  |             py = py.split(" ") | 
					
						
							|  |  |  |  |             for i in range(1, len(py) + 1): cv["name_py_pref0_tks"] += " " + "".join(py[:i]) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |         cv["name_kwd"] = name | 
					
						
							|  |  |  |  |         cv["name_pinyin_kwd"] = PY.get_pinyins(nm[:20], ' ')[:3] | 
					
						
							|  |  |  |  |         cv["name_tks"] = ( | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |                 rag_tokenizer.tokenize(name) + " " + (" ".join(list(name)) if not re.match(r"[a-zA-Z ]+$", name) else "") | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  |         ) if name else "" | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         cv["integerity_flt"] /= 2. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if cv.get("phone"): | 
					
						
							|  |  |  |  |         r = re.search(r"(1[3456789][0-9]{9})", cv["phone"]) | 
					
						
							|  |  |  |  |         if not r: | 
					
						
							|  |  |  |  |             cv["phone"] = "" | 
					
						
							|  |  |  |  |         else: | 
					
						
							|  |  |  |  |             cv["phone"] = r.group(1) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     # deal with date  fields | 
					
						
							|  |  |  |  |     if cv.get("updated_at") and isinstance(cv["updated_at"], datetime.datetime): | 
					
						
							|  |  |  |  |         cv["updated_at_dt"] = cv["updated_at"].strftime('%Y-%m-%d %H:%M:%S') | 
					
						
							|  |  |  |  |     else: | 
					
						
							|  |  |  |  |         y, m, d = getYMD(str(cv.get("updated_at", ""))) | 
					
						
							|  |  |  |  |         if not y: y = "2012" | 
					
						
							|  |  |  |  |         if not m: m = "01" | 
					
						
							|  |  |  |  |         if not d: d = "01" | 
					
						
							|  |  |  |  |         cv["updated_at_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | 
					
						
							|  |  |  |  |         # long text tokenize | 
					
						
							|  |  |  |  | 
 | 
					
						
							| 
									
										
										
										
											2024-04-28 19:13:33 +08:00
										 |  |  |  |     if cv.get("responsibilities"): cv["responsibilities_ltks"] = rag_tokenizer.tokenize(rmHtmlTag(cv["responsibilities"])) | 
					
						
							| 
									
										
										
										
											2024-02-23 18:28:12 +08:00
										 |  |  |  | 
 | 
					
						
							|  |  |  |  |     # for yes or no field | 
					
						
							|  |  |  |  |     fea = [] | 
					
						
							|  |  |  |  |     for f, y, n in is_fld: | 
					
						
							|  |  |  |  |         if f not in cv: continue | 
					
						
							|  |  |  |  |         if cv[f] == '是': fea.append(y) | 
					
						
							|  |  |  |  |         if cv[f] == '否': fea.append(n) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if fea: cv["tag_kwd"] = fea | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv = forEdu(cv) | 
					
						
							|  |  |  |  |     cv = forProj(cv) | 
					
						
							|  |  |  |  |     cv = forWork(cv) | 
					
						
							|  |  |  |  |     cv = birth(cv) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["corp_proj_sch_deg_kwd"] = [c for c in cv.get("corp_tag_kwd", [])] | 
					
						
							|  |  |  |  |     for i in range(len(cv["corp_proj_sch_deg_kwd"])): | 
					
						
							|  |  |  |  |         for j in cv.get("sch_rank_kwd", []): cv["corp_proj_sch_deg_kwd"][i] += "+" + j | 
					
						
							|  |  |  |  |     for i in range(len(cv["corp_proj_sch_deg_kwd"])): | 
					
						
							|  |  |  |  |         if cv.get("highest_degree_kwd"): cv["corp_proj_sch_deg_kwd"][i] += "+" + cv["highest_degree_kwd"] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     try: | 
					
						
							|  |  |  |  |         if not cv.get("work_exp_flt") and cv.get("work_start_time"): | 
					
						
							|  |  |  |  |             if re.match(r"[0-9]{9,}", str(cv["work_start_time"])): | 
					
						
							|  |  |  |  |                 cv["work_start_dt"] = turnTm2Dt(cv["work_start_time"]) | 
					
						
							|  |  |  |  |                 cv["work_exp_flt"] = (time.time() - int(int(cv["work_start_time"]) / 1000)) / 3600. / 24. / 365. | 
					
						
							|  |  |  |  |             elif re.match(r"[0-9]{4}[^0-9]", str(cv["work_start_time"])): | 
					
						
							|  |  |  |  |                 y, m, d = getYMD(str(cv["work_start_time"])) | 
					
						
							|  |  |  |  |                 cv["work_start_dt"] = f"%s-%02d-%02d 00:00:00" % (y, int(m), int(d)) | 
					
						
							|  |  |  |  |                 cv["work_exp_flt"] = int(str(datetime.date.today())[0:4]) - int(y) | 
					
						
							|  |  |  |  |     except Exception as e: | 
					
						
							|  |  |  |  |         print("【EXCEPTION】", e, "==>", cv.get("work_start_time")) | 
					
						
							|  |  |  |  |     if "work_exp_flt" not in cv and cv.get("work_experience", 0): cv["work_exp_flt"] = int(cv["work_experience"]) / 12. | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     keys = list(cv.keys()) | 
					
						
							|  |  |  |  |     for k in keys: | 
					
						
							|  |  |  |  |         if not re.search(r"_(fea|tks|nst|dt|int|flt|ltks|kwd|id)$", k): del cv[k] | 
					
						
							|  |  |  |  |     for k in cv.keys(): | 
					
						
							|  |  |  |  |         if not re.search("_(kwd|id)$", k) or type(cv[k]) != type([]): continue | 
					
						
							|  |  |  |  |         cv[k] = list(set([re.sub("(市)$", "", str(n)) for n in cv[k] if n not in ['中国', '0']])) | 
					
						
							|  |  |  |  |     keys = [k for k in cv.keys() if re.search(r"_feas*$", k)] | 
					
						
							|  |  |  |  |     for k in keys: | 
					
						
							|  |  |  |  |         if cv[k] <= 0: del cv[k] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     cv["tob_resume_id"] = str(cv["tob_resume_id"]) | 
					
						
							|  |  |  |  |     cv["id"] = cv["tob_resume_id"] | 
					
						
							|  |  |  |  |     print("CCCCCCCCCCCCCCC") | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     return dealWithInt64(cv) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  | def dealWithInt64(d): | 
					
						
							|  |  |  |  |     if isinstance(d, dict): | 
					
						
							|  |  |  |  |         for n, v in d.items(): | 
					
						
							|  |  |  |  |             d[n] = dealWithInt64(v) | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if isinstance(d, list): | 
					
						
							|  |  |  |  |         d = [dealWithInt64(t) for t in d] | 
					
						
							|  |  |  |  | 
 | 
					
						
							|  |  |  |  |     if isinstance(d, np.integer): d = int(d) | 
					
						
							|  |  |  |  |     return d | 
					
						
							|  |  |  |  | 
 |