OttoYu committed on
Commit
b98028a
·
verified ·
1 Parent(s): a60b268

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -46
app.py CHANGED
@@ -5,12 +5,42 @@ import aiohttp
5
  import asyncio
6
  import ssl
7
  from aiohttp import ClientSession
 
8
 
9
  ssl_context = ssl.create_default_context()
10
  ssl_context.check_hostname = False
11
  ssl_context.verify_mode = ssl.CERT_NONE
12
 
13
- def setup_jieba_dictionaries():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  dictionaries = [
15
  'flag/RVT_AddressCh.txt',
16
  'flag/RVT_AddressEn.txt',
@@ -19,19 +49,6 @@ def setup_jieba_dictionaries():
19
  'flag/RVT_BuildingEn.txt'
20
  ]
21
  for file_path in dictionaries:
22
- jieba.load_userdict(file_path)
23
-
24
- def process_text(text):
25
- setup_jieba_dictionaries()
26
-
27
- user_dict_terms = set()
28
- for file_path in [
29
- 'flag/RVT_AddressCh.txt',
30
- 'flag/RVT_AddressEn.txt',
31
- 'flag/RVT_Area.txt',
32
- 'flag/RVT_BuildingCh.txt',
33
- 'flag/RVT_BuildingEn.txt'
34
- ]:
35
  try:
36
  with open(file_path, 'r', encoding='utf-8') as f:
37
  user_dict_terms.update(line.strip().split()[0] for line in f)
@@ -39,6 +56,22 @@ def process_text(text):
39
  print(f'File not found: {file_path}')
40
  except Exception as e:
41
  print(f'Error reading file {file_path}: {e}')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
  lines = text.splitlines()
44
  results = []
@@ -52,14 +85,12 @@ def process_text(text):
52
  return results
53
 
54
  def reformat_text(text):
55
- lines = text.splitlines()
56
- return [line.strip() for line in lines if line.strip()]
57
 
58
  def process_text_only(text, reformat):
59
  extracted_keywords = process_text(text)
60
  if reformat:
61
  extracted_keywords = reformat_text('\n'.join(extracted_keywords))
62
- # Join keywords with newline characters
63
  return '\n'.join(extracted_keywords)
64
 
65
  async def lookup_address(query, language='zh-Hant'):
@@ -89,44 +120,51 @@ async def lookup_address(query, language='zh-Hant'):
89
 
90
  async def get_address_lookup_results(keywords):
91
  results = []
 
 
92
  for keyword in keywords:
93
  keyword = keyword.strip()
94
- if not keyword: # Skip empty keywords
95
  continue
96
- lookup_results = await lookup_address(keyword)
97
- if 'SuggestedAddress' in lookup_results and isinstance(lookup_results['SuggestedAddress'], list):
98
- first_match = lookup_results['SuggestedAddress'][0] # Use the first match
99
- full_address = 'No matches found'
100
- geo_address = 'N/A'
101
- latitude = 'N/A'
102
- longitude = 'N/A'
103
- matched_building = 'No Building Name'
 
 
 
 
 
104
 
105
  if first_match:
106
  premises_address = first_match['Address']['PremisesAddress']
107
  raw_address = premises_address.get('ChiPremisesAddress', {})
108
  matched_building = raw_address.get('BuildingName', 'No Building Name')
109
- full_address = matched_building
110
- geo_address = premises_address.get('GeoAddress', 'N/A')
 
 
 
 
 
 
111
  geo_info = premises_address.get('GeospatialInformation', {})
112
- latitude = geo_info.get('Latitude', 'N/A')
113
- longitude = geo_info.get('Longitude', 'N/A')
114
-
115
- results.append({
116
- 'Keyword': keyword,
117
- 'Full Address': full_address,
118
- 'Geo Address': geo_address,
119
- 'Latitude': latitude,
120
- 'Longitude': longitude
121
- })
122
  else:
123
- results.append({
124
- 'Keyword': keyword,
125
- 'Full Address': 'No matches found',
126
- 'Geo Address': 'N/A',
127
- 'Latitude': 'N/A',
128
- 'Longitude': 'N/A'
129
- })
130
 
131
  return results
132
 
@@ -140,6 +178,15 @@ async def gradio_function(text, reformat, perform_lookup):
140
 
141
  return extracted_keywords, address_results
142
 
 
 
 
 
 
 
 
 
 
143
  def gradio_interface(text, reformat, perform_lookup):
144
  return asyncio.run(gradio_function(text, reformat, perform_lookup))
145
 
@@ -158,4 +205,4 @@ interface = gr.Interface(
158
  description="Extract address keywords using NLP and optionally perform address lookup using ALS."
159
  )
160
 
161
- interface.launch()
 
5
  import asyncio
6
  import ssl
7
  from aiohttp import ClientSession
8
+ from functools import lru_cache
9
 
# Module-level SSL context built from the library defaults.
# NOTE(review): hostname checking is turned off and verify_mode is CERT_NONE,
# so connections made with this context do not validate the server
# certificate — confirm this is intentional before relying on it.
ssl_context = ssl.create_default_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
13
 
# Nested table of Hong Kong place names: region -> district -> list of
# place names within that district. is_valid_for_lookup() returns False for
# any keyword that equals a district key or appears in one of these lists.
area_data = {
    '香港': {
        '中西區': ["西環", "堅尼地城", "石塘咀", "西營盤", "上環", "中環", "金鐘", "西半山", "中半山", "半山", "山頂"],
        '灣仔': ["灣仔", "銅鑼灣", "跑馬地", "大坑", "掃桿埔", "渣甸山"],
        '東區': ["天后", "寶馬山", "北角", "鰂魚涌", "西灣河", "筲箕灣", "柴灣", "小西灣"],
        '南區': ["薄扶林", "香港仔", "鴨脷洲", "黃竹坑", "壽臣山", "淺水灣", "舂磡角", "赤柱", "大潭", "石澳", "田灣"]
    },
    '九龍': {
        '油尖旺': ["尖沙咀", "油麻地", "西九龍", "京士柏", "旺角", "大角咀", "佐敦", "太子"],
        '深水埗': ["美孚", "荔枝角", "長沙灣", "深水埗", "石硤尾", "又一村", "大窩坪", "昂船洲"],
        '九龍城': ["紅磡", "土瓜灣", "馬頭角", "馬頭圍", "啟德", "九龍城", "何文田", "九龍塘", "筆架山"],
        '黃大仙': ["新蒲崗", "黃大仙", "東頭", "橫頭磡", "樂富", "鑽石山", "慈雲山", "牛池灣"],
        '觀塘': ["坪石", "九龍灣", "牛頭角", "佐敦谷", "觀塘", "秀茂坪", "藍田", "油塘", "鯉魚門"]
    },
    '新界': {
        '葵青': ["葵涌", "青衣", "葵芳"],
        '荃灣': ["荃灣", "梨木樹", "汀九", "深井", "青龍頭", "馬灣", "欣澳"],
        '屯門': ["大欖涌", "掃管笏", "屯門", "藍地"],
        '元朗': ["洪水橋", "廈村", "流浮山", "天水圍", "元朗", "新田", "落馬洲", "錦田", "石崗", "八鄉"],
        '北區': ["粉嶺", "聯和墟", "上水", "石湖墟", "沙頭角", "鹿頸", "烏蛟騰"],
        '大埔': ["大埔墟", "大埔", "大埔滘", "大尾篤", "船灣", "樟木頭", "企嶺下", "太和"],
        '沙田': ["大圍", "沙田", "火炭", "馬料水", "烏溪沙", "馬鞍山"],
        '西貢': ["清水灣", "西貢", "大網仔", "將軍澳", "坑口", "調景嶺", "馬游塘"],
        '離島': ["長洲", "坪洲", "大嶼山", "東涌", "南丫島"]
    }
}
40
+
41
+ @lru_cache(maxsize=None)
42
+ def load_user_dict_terms():
43
+ user_dict_terms = set()
44
  dictionaries = [
45
  'flag/RVT_AddressCh.txt',
46
  'flag/RVT_AddressEn.txt',
 
49
  'flag/RVT_BuildingEn.txt'
50
  ]
51
  for file_path in dictionaries:
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  try:
53
  with open(file_path, 'r', encoding='utf-8') as f:
54
  user_dict_terms.update(line.strip().split()[0] for line in f)
 
56
  print(f'File not found: {file_path}')
57
  except Exception as e:
58
  print(f'Error reading file {file_path}: {e}')
59
+ return user_dict_terms
60
+
def setup_jieba_dictionaries():
    """Register every bundled dictionary file with jieba as a user dictionary."""
    for dictionary_path in (
        'flag/RVT_AddressCh.txt',
        'flag/RVT_AddressEn.txt',
        'flag/RVT_Area.txt',
        'flag/RVT_BuildingCh.txt',
        'flag/RVT_BuildingEn.txt',
    ):
        jieba.load_userdict(dictionary_path)
71
+
72
+ def process_text(text):
73
+ setup_jieba_dictionaries()
74
+ user_dict_terms = load_user_dict_terms()
75
 
76
  lines = text.splitlines()
77
  results = []
 
85
  return results
86
 
def reformat_text(text):
    """Split *text* into lines, trim each one, and return the non-empty ones.

    Lines that are empty or contain only whitespace are dropped; the order
    of the remaining lines is preserved.
    """
    trimmed = (raw.strip() for raw in text.splitlines())
    return [entry for entry in trimmed if entry]
 
89
 
def process_text_only(text, reformat):
    """Extract keywords from *text* and return them as one newline-joined string.

    When *reformat* is truthy, the joined keywords are first passed through
    reformat_text() and the cleaned lines are joined again.
    """
    newline = '\n'
    found = process_text(text)
    if not reformat:
        return newline.join(found)
    return newline.join(reformat_text(newline.join(found)))
95
 
96
  async def lookup_address(query, language='zh-Hant'):
 
120
 
def _build_lookup_entry(keyword, lookup_result):
    """Build the result row for one keyword from its lookup response.

    Returns a dict holding 'Keyword' plus whichever of 'Full Address',
    'Geo Address', 'Latitude' and 'Longitude' the first suggested address
    supplies, or None when the response has no usable match.
    """
    if 'SuggestedAddress' not in lookup_result:
        return None
    suggestions = lookup_result['SuggestedAddress']
    # An empty list would raise IndexError on [0]; treat it as "no match".
    if not isinstance(suggestions, list) or not suggestions:
        return None
    first_match = suggestions[0]
    if not first_match:
        return None

    premises_address = first_match['Address']['PremisesAddress']
    raw_address = premises_address.get('ChiPremisesAddress', {})
    matched_building = raw_address.get('BuildingName', 'No Building Name')
    geo_info = premises_address.get('GeospatialInformation', {})

    entry = {'Keyword': keyword}
    if matched_building != 'No Building Name':
        entry['Full Address'] = matched_building

    optional_fields = (
        ('Geo Address', premises_address.get('GeoAddress')),
        ('Latitude', geo_info.get('Latitude')),
        ('Longitude', geo_info.get('Longitude')),
    )
    for field, value in optional_fields:
        # Skip missing values and the 'N/A' placeholder.
        if value and value != 'N/A':
            entry[field] = value

    # A row that only carries the keyword has nothing worth reporting.
    return entry if len(entry) > 1 else None


async def get_address_lookup_results(keywords):
    """Look up every eligible keyword concurrently and return the matches.

    Each keyword is stripped; blank keywords and keywords rejected by
    is_valid_for_lookup() are skipped. The remaining keywords are passed to
    lookup_address() and awaited together with asyncio.gather().

    Returns a list of dicts, one per keyword whose first suggested address
    yielded at least one field besides 'Keyword'.
    """
    # Keep the keywords that are actually sent, in order, so every response
    # can be paired with the keyword that produced it. Pairing against the
    # unfiltered input list misattributes results once any keyword is skipped.
    valid_keywords = []
    for keyword in keywords:
        keyword = keyword.strip()
        if keyword and is_valid_for_lookup(keyword):
            valid_keywords.append(keyword)

    # gather() returns results in the same order as the awaitables passed in.
    lookup_results = await asyncio.gather(
        *(lookup_address(keyword) for keyword in valid_keywords)
    )

    results = []
    for keyword, lookup_result in zip(valid_keywords, lookup_results):
        entry = _build_lookup_entry(keyword, lookup_result)
        if entry is not None:
            results.append(entry)

    return results
170
 
 
178
 
179
  return extracted_keywords, address_results
180
 
def is_valid_for_lookup(keyword):
    """Return False when *keyword* is a bare area name listed in area_data.

    A keyword is rejected if it equals a district key or appears in any
    district's list of place names; every other keyword is accepted.
    """
    for district_map in area_data.values():
        if keyword in district_map:
            return False
        if any(keyword in places for places in district_map.values()):
            return False
    return True
189
+
def gradio_interface(text, reformat, perform_lookup):
    """Synchronous wrapper that runs gradio_function() to completion.

    The coroutine is executed with asyncio.run() and its result is returned
    unchanged.
    """
    pending = gradio_function(text, reformat, perform_lookup)
    return asyncio.run(pending)
192
 
 
205
  description="Extract address keywords using NLP and optionally perform address lookup using ALS."
206
  )
207
 
208
+ interface.launch()