minskiter committed on
Commit
bcb2102
1 Parent(s): f13e3b8

fix(predictor): fix some error

Browse files
Files changed (3) hide show
  1. docker-compose.yml +1 -0
  2. predictor/__init__.py +45 -23
  3. server.py +16 -3
docker-compose.yml CHANGED
@@ -8,4 +8,5 @@ services:
8
  - "50050:50051"
9
  environment:
10
  - HF_Token=${HF_Token}
 
11
 
 
8
  - "50050:50051"
9
  environment:
10
  - HF_Token=${HF_Token}
11
+ - DEVICE=cpu
12
 
predictor/__init__.py CHANGED
@@ -46,7 +46,7 @@ class Predictor():
46
  + r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
47
  + r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
48
  + r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
49
- + r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱]"
50
  first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
51
  self.name_pattern = re.compile(last_name + first_name)
52
  self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
@@ -106,14 +106,14 @@ class Predictor():
106
  def to_date(self, datestr:str):
107
  if re.match("^\d{4}$",datestr):
108
  return date(int(datestr),1,1)
109
- match = re.match("^\d{4}(\D)\d{1,2}",datestr)
110
  if match is not None:
111
  try:
112
- m = min(max(int(datestr.split(match.group(1))[1]),1),12)
113
- return date(int(datestr.split(match.group(1))[0]),m,1)
 
114
  except ValueError:
115
- print(int(datestr.split(match.group(1))[0]),int(datestr.split(match.group(1))[1]))
116
- raise
117
  if datestr=="至今":
118
  return self.today
119
  return None
@@ -206,25 +206,44 @@ class Predictor():
206
  # 获取名字,先过滤所有空白字符,防止名字中间有空格
207
  remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
208
  start_time = time.perf_counter()
 
209
  for block in self.split_to_blocks(remove_blanks_text):
210
  block_text,block_l = block['text'],block['start']
211
  entities = self.pipelines['name'](block_text)
212
  for entity in entities:
213
- if entity['entity']=='NAME' and self.name_pattern.match(entity['word']) is not None:
214
- obj = {
215
- 'start': index_mapper[block_l+entity['start']],
216
- 'end': index_mapper[block_l+entity['end']-1]+1,
217
- 'entity': 'NAME',
218
- 'text': entity['word']
219
- }
220
- repeat = False
221
- for o in return_obj['name']:
222
- if obj['start']==o['start'] and obj['end']==o['end']:
223
- repeat = True
224
- break
225
- if not repeat:
226
- obj['origin'] = text[obj['start']:obj['end']]
227
- return_obj['name'].append(obj)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
  end_time = time.perf_counter()
229
  self.logger.info(f"process name time: {end_time-start_time}")
230
  # 获取年龄
@@ -301,6 +320,8 @@ class Predictor():
301
  break
302
  if not repeat:
303
  obj['origin'] = text[obj['start']:obj['end']]
 
 
304
  return_obj['schools'].append(obj)
305
  # 正则找学校
306
  for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
@@ -309,7 +330,7 @@ class Predictor():
309
  'start': index_mapper[start],
310
  'end': index_mapper[end-1]+1,
311
  'entity': 'SCHOOL',
312
- 'text': school_match.group(),
313
  }
314
  repeat = False
315
  for o in return_obj['schools']:
@@ -320,6 +341,7 @@ class Predictor():
320
  obj['origin'] = text[obj['start']:obj['end']]
321
  obj['level'] = self.schools[obj['text']]
322
  return_obj['schools'].append(obj)
 
323
  end_time = time.perf_counter()
324
  self.logger.info(f"process school time: {end_time-start_time}")
325
  start_time = time.perf_counter()
@@ -410,7 +432,7 @@ class Predictor():
410
  diff_m = end.month-start.month
411
  work_month += diff_y * 12 + diff_m
412
  last_end = end
413
- return_obj['work_time'] = math.ceil(work_month/12)
414
  end_time = time.perf_counter()
415
  self.logger.info(f"process work time: {end_time-start_time}")
416
  start_time = time.perf_counter()
 
46
  + r"空,曾,毋,沙,乜,养,鞠,须,丰,巢,关,蒯,相,查,后,荆,红,游,竺,权,逑,盖,益,桓,公,万俟,司马,上官,欧阳,夏侯,诸葛,闻人,东方,赫连,皇甫,尉迟," \
47
  + r"公羊,澹台,公冶,宗政,濮阳,淳于,单于,太叔,申屠,公孙,仲孙,轩辕,令狐,锺离,宇文,长孙,慕容,鲜于,闾丘,司徒,司空,丌官,司寇,仉,督,子车," \
48
  + r"颛孙,端木,巫马,公西,漆雕,乐正,壤驷,公良,拓拔,夹谷,宰父,谷梁,晋,楚,阎,法,汝,鄢,涂,钦,段干,百里,东郭,南门,呼延,归,海,羊舌,微生,岳," \
49
+ + r"帅,缑,亢,况,后,有,琴,梁丘,左丘,东门,西门,商,牟,佘,佴,伯,赏,南宫,墨,哈,谯,笪,年,爱,阳,佟,第五,言,福,邱,钟]"
50
  first_name = r' {0,3}[\u4e00-\u9fa5]( {0,3}[\u4e00-\u9fa5]){0,3}'
51
  self.name_pattern = re.compile(last_name + first_name)
52
  self.phone_pattern = re.compile(r'1 {0,4}(3 {0,4}\d|4 {0,4}[5-9]|5 {0,4}[0-35-9]|6 {0,4}[2567]|7 {0,4}[0-8]|8 {0,4}\d|9 {0,4}[0-35-9]) {0,4}(\d {0,4}){8}')
 
106
  def to_date(self, datestr:str):
107
  if re.match("^\d{4}$",datestr):
108
  return date(int(datestr),1,1)
109
+ match = re.match("^(\d{4})\D(\d{1,2})",datestr)
110
  if match is not None:
111
  try:
112
+ y = int(match.group(1))
113
+ m = min(max(int(match.group(2)),1),12)
114
+ return date(y,m,1)
115
  except ValueError:
116
+ print(datestr)
 
117
  if datestr=="至今":
118
  return self.today
119
  return None
 
206
  # 获取名字,先过滤所有空白字符,防止名字中间有空格
207
  remove_blanks_text, index_mapper = self.remove_blanks(text, re.compile(r' '))
208
  start_time = time.perf_counter()
209
+ backup_name = []
210
  for block in self.split_to_blocks(remove_blanks_text):
211
  block_text,block_l = block['text'],block['start']
212
  entities = self.pipelines['name'](block_text)
213
  for entity in entities:
214
+ if entity['entity']=='NAME':
215
+ if self.name_pattern.match(entity['word']) is not None:
216
+ obj = {
217
+ 'start': index_mapper[block_l+entity['start']],
218
+ 'end': index_mapper[block_l+entity['end']-1]+1,
219
+ 'entity': 'NAME',
220
+ 'text': entity['word']
221
+ }
222
+ repeat = False
223
+ for o in return_obj['name']:
224
+ if obj['start']==o['start'] and obj['end']==o['end']:
225
+ repeat = True
226
+ break
227
+ if not repeat:
228
+ obj['origin'] = text[obj['start']:obj['end']]
229
+ return_obj['name'].append(obj)
230
+ else:
231
+ obj = {
232
+ 'start': index_mapper[block_l+entity['start']],
233
+ 'end': index_mapper[block_l+entity['end']-1]+1,
234
+ 'entity': 'NAME',
235
+ 'text': entity['word']
236
+ }
237
+ repeat = False
238
+ for o in return_obj['name']:
239
+ if obj['start']==o['start'] and obj['end']==o['end']:
240
+ repeat = True
241
+ break
242
+ if not repeat:
243
+ obj['origin'] = text[obj['start']:obj['end']]
244
+ backup_name.append(obj)
245
+ if len(return_obj['name'])==0:
246
+ return_obj['name'] = backup_name
247
  end_time = time.perf_counter()
248
  self.logger.info(f"process name time: {end_time-start_time}")
249
  # 获取年龄
 
320
  break
321
  if not repeat:
322
  obj['origin'] = text[obj['start']:obj['end']]
323
+ if "text" not in obj:
324
+ obj['text'] = obj['origin'].split("\n")[-1]
325
  return_obj['schools'].append(obj)
326
  # 正则找学校
327
  for school_match in re.finditer(r"|".join(self.schools.keys()), remove_blanks_text):
 
330
  'start': index_mapper[start],
331
  'end': index_mapper[end-1]+1,
332
  'entity': 'SCHOOL',
333
+ 'text': school_match.group().split('\n')[-1],
334
  }
335
  repeat = False
336
  for o in return_obj['schools']:
 
341
  obj['origin'] = text[obj['start']:obj['end']]
342
  obj['level'] = self.schools[obj['text']]
343
  return_obj['schools'].append(obj)
344
+ return_obj['schools'] = sorted(return_obj['schools'], key=lambda x: x['start'])
345
  end_time = time.perf_counter()
346
  self.logger.info(f"process school time: {end_time-start_time}")
347
  start_time = time.perf_counter()
 
432
  diff_m = end.month-start.month
433
  work_month += diff_y * 12 + diff_m
434
  last_end = end
435
+ return_obj['work_time'] = max(math.ceil(work_month/12),0)
436
  end_time = time.perf_counter()
437
  self.logger.info(f"process work time: {end_time-start_time}")
438
  start_time = time.perf_counter()
server.py CHANGED
@@ -11,6 +11,7 @@ from datetime import date
11
 
12
  HF_TOKEN = os.environ["HF_Token"]
13
  PORT = os.environ.get("PORT", "50051")
 
14
  login(HF_TOKEN)
15
 
16
  class Resume(protos.resume_pb2_grpc.ResumeServicer):
@@ -22,15 +23,27 @@ class Resume(protos.resume_pb2_grpc.ResumeServicer):
22
  pipeline=pipeline(
23
  "textencode",
24
  model="minskiter/cossim-bert-chinese-wwm-ext",
25
- device="cpu",
26
  trust_remote_code=True,
27
  use_auth_token=True
28
  )
29
  )
30
  self.predictor = Predictor(
31
  pipelines={
32
- "name": pipeline("nerpipe", model="minskiter/resume-token-classification-name-0708",trust_remote_code=True,use_auth_token=True),
33
- "common": pipeline("nerpipe",model="minskiter/resume-token-classification",trust_remote_code=True,use_auth_token=True)
 
 
 
 
 
 
 
 
 
 
 
 
34
  },
35
  paths=[
36
  "data/W020230619818476939351.xls",
 
11
 
12
  HF_TOKEN = os.environ["HF_Token"]
13
  PORT = os.environ.get("PORT", "50051")
14
+ DEVICE = os.environ.get("DEVICE", "cpu")
15
  login(HF_TOKEN)
16
 
17
  class Resume(protos.resume_pb2_grpc.ResumeServicer):
 
23
  pipeline=pipeline(
24
  "textencode",
25
  model="minskiter/cossim-bert-chinese-wwm-ext",
26
+ device=DEVICE,
27
  trust_remote_code=True,
28
  use_auth_token=True
29
  )
30
  )
31
  self.predictor = Predictor(
32
  pipelines={
33
+ "name": pipeline(
34
+ "nerpipe",
35
+ device=DEVICE,
36
+ model="minskiter/resume-token-classification-name-0708",
37
+ trust_remote_code=True,
38
+ use_auth_token=True
39
+ ),
40
+ "common": pipeline(
41
+ "nerpipe",
42
+ model="minskiter/resume-token-classification",
43
+ device=DEVICE,
44
+ trust_remote_code=True,
45
+ use_auth_token=True
46
+ )
47
  },
48
  paths=[
49
  "data/W020230619818476939351.xls",