"""Fuzzy-match free-text addresses against local name lists and query the ALS API.

Each input line is reduced to its building part, matched against a pool of
known names loaded from text files, and the best match is then looked up on
the ALS web service (www.als.gov.hk). A small Gradio UI exposes the batch
search.
"""
import re
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from difflib import SequenceMatcher

import gradio as gr
import requests

# Region -> district -> sub-district names. Kept exactly as provided (including
# repeated entries); clean_area_data() produces the normalized, de-duplicated view.
areaData = {
    "Hong Kong": {
        "Central and Western": [
            "Sai Ying Pun", "Kennedy Town", "Shek Tong Tsui", "Sai Wan",
            "Sheung Wan", "Central", "Admiralty", "Mid-Levels West",
            "Mid-Levels", "The Peak"
        ],
        "Wan Chai": [
            "Wan Chai", "Causeway Bay", "Happy Valley", "Tai Hang",
            "Stubbs Road", "Jardine's Lookout"
        ],
        "Eastern": [
            "Tin Hau", "Braemar Hill", "North Point", "Quarry Bay",
            "Sai Wan Ho", "Shau Kei Wan", "Chai Wan", "Siu Sai Wan"
        ],
        "Southern": [
            "Pok Fu Lam", "Aberdeen", "Ap Lei Chau", "Wong Chuk Hang",
            "Shouson Hill", "Repulse Bay", "Chung Hom Kok", "Stanley",
            "Tai Tam", "Shek O", "Telegraph Bay"
        ]
    },
    "Kowloon": {
        "Yau Tsim Mong": [
            "Tsim Sha Tsui", "Yau Ma Tei", "West Kowloon", "Kowloon Tong",
            "Mong Kok", "Tai Kok Tsui", "Jordan", "Prince Edward"
        ],
        "Sham Shui Po": [
            "Mei Foo", "Lai Chi Kok", "Cheung Sha Wan", "Sham Shui Po",
            "Shek Kip Mei", "Tai Wo Ping", "Stonecutters Island"
        ],
        "Kowloon City": [
            "Hung Hom", "To Kwa Wan", "Ma Tau Kok", "Ma Tau Wai", "Kai Tak",
            "Kowloon City", "Ho Man Tin", "Kowloon Tong", "Beacon Hill"
        ],
        "Wong Tai Sin": [
            "San Po Kong", "Wong Tai Sin", "Tung Tau", "Wang Tau Hom",
            "Lok Fu", "Diamond Hill", "Tsz Wan Shan", "Ngau Chi Wan"
        ],
        "Kwun Tong": [
            "Ping Shek", "Kowloon Bay", "Ngau Tau Kok", "Tsz Wan Shan",
            "Kwun Tong", "Sau Mau Ping", "Lam Tin", "Yau Tong", "Lei Yue Mun"
        ]
    },
    "New Territories": {
        "Kwai Tsing": [
            "Kwai Chung", "Tsing Yi", "Kwai Fong"
        ],
        "Tsuen Wan": [
            "Tsuen Wan", "Tsing Lung Bridge", "Tsing Hung Bridge",
            "Shen Tsuen", "Tsing Chung Koon", "Ma Wan", "Tsing Lung Bridge"
        ],
        "Tuen Mun": [
            "Tai Lam Chung", "Siu Lam", "Tuen Mun", "Lam Tei"
        ],
        "Yuen Long": [
            "Hung Shui Kiu", "Ha Tsuen", "Lau Fau Shan", "Tin Shui Wai",
            "Yuen Long", "San Tin", "Lok Ma Chau", "Kam Tin", "Shek Kong",
            "Pat Heung"
        ],
        "North": [
            "Fanling", "Luen Wo Hui", "Sheung Shui", "Shek Wu Hui",
            "Sha Tau Kok", "Lok Keng", "Wu Kau Tang"
        ],
        "Tai Po": [
            "Tai Po Market", "Tai Po", "Tai Po Kau", "Tai Mei Tuk",
            "Plover Cove", "Cheung Uk Tau", "Tai Wo"
        ],
        "Sha Tin": [
            "Tai Wai", "Sha Tin", "Fo Tan", "Ma On Shan", "Shui Chuen O",
            "Ma On Shan"
        ],
        "Sai Kung": [
            "Clear Water Bay", "Sai Kung", "Tai Mong Tsai", "Tseung Kwan O",
            "Hang Hau", "Tiu Keng Leng", "Ma Yau Tong"
        ],
        "Islands": [
            "Cheung Chau", "Peng Chau", "Lantau Island", "Tung Chung",
            "Lamma Island"
        ]
    }
}

# Seconds to wait for the ALS service before giving up on a single lookup.
ALS_REQUEST_TIMEOUT = 10

# Output label -> XML tag, for the English/Chinese premises sections.
_PREMISES_FIELDS = {
    'Estate': 'EstateName',
    'Street': 'StreetName',
    'Building No': 'BuildingNoFrom',
    'District': 'DcDistrict',
    'Region': 'Region',
}

# Output label -> XML tag, for the geospatial section.
_GEO_FIELDS = {
    'Latitude': 'Latitude',
    'Longitude': 'Longitude',
    'Northing': 'Northing',
    'Easting': 'Easting',
}

# (result key, XML element to locate, fields to read from it), in output order.
_ALS_SECTIONS = (
    ('English Address', 'EngPremisesAddress', _PREMISES_FIELDS),
    ('Chinese Address', 'ChiPremisesAddress', _PREMISES_FIELDS),
    ('Geospatial Information', 'GeospatialInformation', _GEO_FIELDS),
)

# Sub-district names matching this pattern are dropped by clean_area_data().
_INVALID_SUBDISTRICT = re.compile(r'Non-Building|Invalid|Other', re.I)


def normalize_text(text):
    """Return *text* lower-cased, trimmed, with whitespace runs collapsed to one space."""
    return re.sub(r'\s+', ' ', text.lower().strip())


def normalize_address(address):
    """Return *address* upper-cased with punctuation removed and whitespace collapsed.

    Punctuation is stripped *before* whitespace is collapsed so that removing
    a symbol surrounded by spaces (e.g. ``"A - B"``) cannot leave a double
    space behind.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', address)
    return re.sub(r'\s+', ' ', without_punctuation).strip().upper()


def load_and_normalize_address_pool(file_paths):
    """Load candidate addresses, one per line, from each file in *file_paths*.

    Blank lines are skipped. Files that are missing or unreadable are
    reported on stdout and skipped, so one bad file does not prevent the
    others from loading.

    Returns:
        A list of ``(original_line, normalized_line)`` tuples.
    """
    address_pool = []
    for file_path in file_paths:
        try:
            # Explicit encoding so results do not depend on the platform default.
            with open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    address = line.strip()
                    if address:
                        address_pool.append((address, normalize_address(address)))
        except FileNotFoundError:
            print(f"File not found: {file_path}")
        except (OSError, UnicodeError) as e:
            print(f"Error reading file {file_path}: {e}")
    return address_pool


def similarity(a, b):
    """Return the SequenceMatcher ratio (0.0-1.0) of *a* and *b*, ignoring spaces."""
    a, b = a.replace(' ', ''), b.replace(' ', '')
    return SequenceMatcher(None, a, b).ratio()


def extract_relevant_part(user_input):
    """Split *user_input* into ``(number_part, address_part)``.

    ``number_part`` is the first run of digits found anywhere in the input
    (``''`` if there is none). ``address_part`` is the input with only a
    *leading* run of digits removed, so a number that appears later in the
    text stays in ``address_part`` as well.
    """
    first_number = re.search(r'\d+', user_input)
    number_part = first_number.group() if first_number else ''
    address_part = re.sub(r'^\d+', '', user_input).strip()
    return number_part, address_part


def match_address(user_input, address_pool):
    """Find the pool entry most similar to *user_input*.

    Args:
        user_input: Free-text address; an optional number is split off with
            extract_relevant_part() and re-attached to the result.
        address_pool: Iterable of ``(original, normalized)`` tuples as built
            by load_and_normalize_address_pool().

    Returns:
        ``(best_match, similarity)``. ``best_match`` is the original pool
        text, prefixed with the extracted number when there is one, or
        ``None`` when no entry scored above 0. On ties the earliest pool
        entry wins.
    """
    number_part, address_part = extract_relevant_part(user_input)
    normalized_input = normalize_address(address_part)

    best_match = None
    highest_similarity = 0
    for original_address, normalized_address in address_pool:
        sim = similarity(normalized_input, normalized_address)
        if sim > highest_similarity:
            highest_similarity = sim
            best_match = original_address

    if best_match and number_part:
        best_match = f"{number_part} {best_match}".strip()
    return best_match, highest_similarity


def _extract_fields(element, fields):
    """Read each tag in *fields* (label -> tag) from under *element*; missing tags give ''."""
    return {label: element.findtext(f".//{tag}", '') for label, tag in fields.items()}


def fetch_address_from_als_api(user_input, timeout=ALS_REQUEST_TIMEOUT):
    """Look up *user_input* on the ALS service and extract the address details.

    Args:
        user_input: Address text to send as the ``q`` query parameter.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        On success, a dict that may contain the keys ``'English Address'``,
        ``'Chinese Address'`` and ``'Geospatial Information'`` (each a dict of
        label -> text), depending on which sections the response contains.
        On a network/HTTP failure or an unparsable response, a ``str``
        describing the error.
    """
    api_url = f"https://www.als.gov.hk/lookup?q={requests.utils.quote(user_input)}"
    try:
        # Without a timeout a stalled connection would block a worker thread forever.
        response = requests.get(api_url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except requests.RequestException as e:
        return f"Error fetching data from ALS API: {e}"
    except ET.ParseError as e:
        # A malformed body must not abort the whole batch in address_search().
        return f"Error parsing response from ALS API: {e}"

    result = {}
    for section_name, element_tag, fields in _ALS_SECTIONS:
        element = root.find(f".//{element_tag}")
        if element is not None:
            result[section_name] = _extract_fields(element, fields)
    return result


def extract_building_from_address(user_input):
    """Return the normalized text before the first comma of *user_input*.

    If the normalized input starts with a comma (or is empty) there is no
    leading part to extract and the whole normalized input is returned.
    """
    normalized_input = normalize_text(user_input)
    match = re.match(r'([^,]+)', normalized_input)
    return match.group(1).strip() if match else normalized_input


def _process_single_address(user_input):
    """Match one input line against the global ``address_pool`` and format the result."""
    building_part = extract_building_from_address(user_input)
    normalized_input = normalize_address(building_part)
    best_match, similarity_score = match_address(normalized_input, address_pool)
    als_result = fetch_address_from_als_api(best_match) if best_match else "No match found."

    parts = [f"Best match: {best_match} (Similarity: {similarity_score:.2f})\n"]
    if isinstance(als_result, dict):
        for address_type, details in als_result.items():
            parts.append(f"\n{address_type}:\n")
            parts.extend(f"{key}: {value}\n" for key, value in details.items())
    else:
        # Error / "no match" messages come back as plain strings.
        parts.append(als_result)
    return "".join(parts)


def address_search(user_inputs):
    """Run the match-and-lookup pipeline for every non-blank line of *user_inputs*.

    Lines are processed concurrently (the per-line API request is I/O-bound)
    and results are returned in input order, separated by a blank line.
    Blank lines are ignored; empty or ``None`` input yields ``""``.
    """
    queries = [line for line in (user_inputs or "").splitlines() if line.strip()]
    if not queries:
        return ""

    with ThreadPoolExecutor() as executor:
        results = list(executor.map(_process_single_address, queries))
    return "\n\n".join(results)


def clean_area_data(area_data):
    """Return a normalized copy of *area_data* (region -> district -> names).

    All names are passed through normalize_text(). Sub-district names
    containing "Non-Building", "Invalid" or "Other" (case-insensitive) are
    dropped, and repeated names within a district are kept only once, in
    first-seen order.
    """
    cleaned_area_data = {}
    for region, districts in area_data.items():
        cleaned_districts = {}
        for district, subdistricts in districts.items():
            valid_subdistricts = (
                normalize_text(name)
                for name in subdistricts
                if not _INVALID_SUBDISTRICT.search(name)
            )
            # dict.fromkeys de-duplicates while preserving order.
            cleaned_districts[normalize_text(district)] = list(dict.fromkeys(valid_subdistricts))
        cleaned_area_data[normalize_text(region)] = cleaned_districts
    return cleaned_area_data


cleaned_area_data = clean_area_data(areaData)

file_paths = [
    'EngBuilding.txt',
    'EngEstate.txt',
    'EngStreet.txt',
    'EngVillage.txt'
]

address_pool = load_and_normalize_address_pool(file_paths)

interface = gr.Interface(
    fn=address_search,
    inputs=gr.Textbox(label="Enter Addresses (one per line, allow Batch Processing)", lines=10),
    outputs=gr.Textbox(label="ALS API Results"),
    title="Address Lookup and Matching (English)",
    description="Enter addresses to find the closest matches and fetch details from the ALS API."
)

if __name__ == "__main__":
    # Only start the web server when run as a script, not when imported.
    interface.launch()