Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +1 -0
- __pycache__/content.cpython-39.pyc +0 -0
- annotation/.DS_Store +0 -0
- annotation/src/__pycache__/utils.cpython-39.pyc +0 -0
- annotation/src/utils.py +186 -0
- app.py +197 -0
- content.py +70 -0
- database/.DS_Store +0 -0
- database/accommodations/.DS_Store +0 -0
- database/accommodations/clean_accommodations_2022.csv +0 -0
- database/attractions/attractions.csv +0 -0
- database/background/attractions.csv +0 -0
- database/background/citySet.txt +311 -0
- database/background/citySet_with_states.txt +312 -0
- database/background/clean_data.py +14 -0
- database/background/get_state_set.py +22 -0
- database/background/stateSet.txt +65 -0
- database/background/test.py +8 -0
- database/flights/.DS_Store +0 -0
- database/flights/clean_Flights_2022.csv +3 -0
- database/googleDistanceMatrix/clean_data.py +17 -0
- database/googleDistanceMatrix/distance.csv +0 -0
- database/googleDistanceMatrix/distance_org.csv +0 -0
- database/restaurants/.DS_Store +0 -0
- database/restaurants/clean_restaurant_2022.csv +0 -0
- evaluation/.DS_Store +0 -0
- evaluation/__pycache__/commonsenseConstraint.cpython-39.pyc +0 -0
- evaluation/__pycache__/eval.cpython-39.pyc +0 -0
- evaluation/__pycache__/hardConstraint.cpython-39.pyc +0 -0
- evaluation/commonsenseConstraint.py +735 -0
- evaluation/eval.py +181 -0
- evaluation/hardConstraint.py +266 -0
- evaluation/scored/1_validation_two-stage_1.jsonl +1 -0
- evaluation/scored/textbox_validation_two-stage_1.jsonl +1 -0
- requirements.txt +3 -0
- tools/__init__.py +0 -0
- tools/__pycache__/__init__.cpython-39.pyc +0 -0
- tools/accommodations/.ipynb_checkpoints/test-checkpoint.ipynb +0 -0
- tools/accommodations/__init__.py +0 -0
- tools/accommodations/__pycache__/__init__.cpython-39.pyc +0 -0
- tools/accommodations/__pycache__/apis.cpython-39.pyc +0 -0
- tools/accommodations/apis.py +91 -0
- tools/accommodations/test.ipynb +2037 -0
- tools/accommodations/test.py +12 -0
- tools/attractions/__pycache__/apis.cpython-39.pyc +0 -0
- tools/attractions/apis.py +34 -0
- tools/attractions/test.py +17 -0
- tools/cities/__pycache__/apis.cpython-39.pyc +0 -0
- tools/cities/apis.py +23 -0
- tools/cities/test.py +0 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
36 |
+
database/flights/clean_Flights_2022.csv filter=lfs diff=lfs merge=lfs -text
|
__pycache__/content.cpython-39.pyc
ADDED
Binary file (4.84 kB). View file
|
|
annotation/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
annotation/src/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (6.95 kB). View file
|
|
annotation/src/utils.py
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import re
|
3 |
+
import os
|
4 |
+
import gradio as gr
|
5 |
+
|
6 |
+
def load_line_json_data(filename):
|
7 |
+
data = []
|
8 |
+
with open(filename, 'r', encoding='utf-8') as f:
|
9 |
+
for line in f.read().strip().split('\n'):
|
10 |
+
unit = json.loads(line)
|
11 |
+
data.append(unit)
|
12 |
+
return data
|
13 |
+
|
14 |
+
def extract_query_number(query_string):
|
15 |
+
"""
|
16 |
+
Extract the number from a query string formatted as "Query X" or "Query X --- Done".
|
17 |
+
|
18 |
+
Args:
|
19 |
+
- query_string (str): The input string.
|
20 |
+
|
21 |
+
Returns:
|
22 |
+
- int: The extracted number if found, else None.
|
23 |
+
"""
|
24 |
+
pattern = r"Query (\d+)"
|
25 |
+
match = re.search(pattern, query_string)
|
26 |
+
return int(match.group(1)) if match else None
|
27 |
+
|
28 |
+
def create_data_display(css_content,data,annotation_idx):
|
29 |
+
return f"""
|
30 |
+
<style>
|
31 |
+
{css_content}
|
32 |
+
</style>
|
33 |
+
<div>
|
34 |
+
<span class="query-highlighted"><strong>Query {annotation_idx}:</strong> {data[annotation_idx-1]['query']}</span><br>
|
35 |
+
<span class="highlighted"><strong>Day:</strong> {data[annotation_idx-1]['days']}</span>
|
36 |
+
<span class="highlighted"><strong>Visiting City Number:</strong> {data[annotation_idx-1]['visiting_city_number']}</span>
|
37 |
+
<span class="highlighted"><strong>Date:</strong> {data[annotation_idx-1]['date']}</span>
|
38 |
+
<span class="highlighted"><strong>Departure:</strong> {data[annotation_idx-1]['org']}</span>
|
39 |
+
<span class="highlighted"><strong>Destination:</strong> {data[annotation_idx-1]['dest']}</span><br>
|
40 |
+
<span class="highlighted-alt"><strong>People Number:</strong> {data[annotation_idx-1]['people_number']}</span>
|
41 |
+
<span class="highlighted-alt"><strong>Budget:</strong> {data[annotation_idx-1]['budget']}</span>
|
42 |
+
<span class="highlighted-alt"><strong>Hotel Rule:</strong> {data[annotation_idx-1]['local_constraint']['house rule']}</span>
|
43 |
+
<span class="highlighted-alt"><strong>Cuisine:</strong> {data[annotation_idx-1]['local_constraint']['cuisine']}</span>
|
44 |
+
<span class="highlighted-alt"><strong>Room Type:</strong> {data[annotation_idx-1]['local_constraint']['room type']}</span>
|
45 |
+
<span class="highlighted-alt"><strong>Transportation:</strong> {data[annotation_idx-1]['local_constraint']['transportation']}</span><br>
|
46 |
+
</div>
|
47 |
+
"""
|
48 |
+
|
49 |
+
def judge_valid_info(info):
|
50 |
+
if info == "" or not info or info == "You don't need to fill in the information for this or later days." :
|
51 |
+
return False
|
52 |
+
return True
|
53 |
+
|
54 |
+
def judge_submit_info(info, current_day, label, annotation_data, *tested_data):
|
55 |
+
if info == "" or not info:
|
56 |
+
raise gr.Error("Day {} {} is empty!".format(current_day, label))
|
57 |
+
if info != "-":
|
58 |
+
if label == "transportation":
|
59 |
+
if not judge_valid_transportation(info, annotation_data):
|
60 |
+
raise gr.Error("Day {} {} is invalid! Please note the transportation.".format(current_day, label))
|
61 |
+
elif label == "accommodation":
|
62 |
+
if not judge_valid_room_type(info, annotation_data, tested_data[0]):
|
63 |
+
raise gr.Error("Day {} {} is invalid! Please note the room type.".format(current_day, label))
|
64 |
+
|
65 |
+
if not judge_valid_room_rule(info, annotation_data, tested_data[0]):
|
66 |
+
raise gr.Error("Day {} {} is invalid! Please note the house rules.".format(current_day, label))
|
67 |
+
|
68 |
+
return True
|
69 |
+
|
70 |
+
|
71 |
+
def judge_valid_transportation(info, annotation_data):
|
72 |
+
if annotation_data['local_constraint']['transportation'] == 'no flight' and 'Flight' in info:
|
73 |
+
return False
|
74 |
+
elif annotation_data['local_constraint']['transportation'] == 'no self-driving' and 'Self-driving' in info:
|
75 |
+
return False
|
76 |
+
return True
|
77 |
+
|
78 |
+
def judge_valid_room_type(info, annotation_data, accommodation_data_all):
|
79 |
+
accommodation_data_filtered = get_filtered_data(info, accommodation_data_all)
|
80 |
+
if annotation_data['local_constraint']['room type'] == 'not shared room' and accommodation_data_filtered['room type'].values[0] == 'Shared room':
|
81 |
+
return False
|
82 |
+
# "shared room", "not shared room", "private room", "entire room"
|
83 |
+
elif annotation_data['local_constraint']['room type'] == 'shared room' and accommodation_data_filtered['room type'].values[0] != 'Shared room':
|
84 |
+
return False
|
85 |
+
|
86 |
+
elif annotation_data['local_constraint']['room type'] == 'private room' and accommodation_data_filtered['room type'].values[0] != 'Private room':
|
87 |
+
return False
|
88 |
+
|
89 |
+
elif annotation_data['local_constraint']['room type'] == 'entire room' and accommodation_data_filtered['room type'].values[0] != 'Entire home/apt':
|
90 |
+
return False
|
91 |
+
|
92 |
+
return True
|
93 |
+
|
94 |
+
def judge_valid_room_rule(info, annotation_data, accommodation_data_all):
|
95 |
+
accommodation_data_filtered = get_filtered_data(info, accommodation_data_all)
|
96 |
+
if annotation_data['local_constraint']['house rule'] == 'smoking' and 'No smoking' in str(accommodation_data_filtered['house_rules'].values[0]):
|
97 |
+
return False
|
98 |
+
if annotation_data['local_constraint']['house rule'] == 'parities' and 'No parties' in str(accommodation_data_filtered['house_rules'].values[0]):
|
99 |
+
return False
|
100 |
+
if annotation_data['local_constraint']['house rule'] == 'children under 10' and 'No children under 10' in str(accommodation_data_filtered['house_rules'].values[0]):
|
101 |
+
return False
|
102 |
+
if annotation_data['local_constraint']['house rule'] == 'visitors' and 'No visitors' in str(accommodation_data_filtered['house_rules'].values[0]):
|
103 |
+
return False
|
104 |
+
if annotation_data['local_constraint']['house rule'] == 'pets' and 'No pets' in str(accommodation_data_filtered['house_rules'].values[0]):
|
105 |
+
return False
|
106 |
+
|
107 |
+
return True
|
108 |
+
|
109 |
+
def judge_valid_cuisine(info, annotation_data, restaurant_data_all, cuisine_set: set):
|
110 |
+
if info != "-" and annotation_data['local_constraint']['cuisine'] is not None and annotation_data['org'] not in info:
|
111 |
+
restaurant_data_filtered = get_filtered_data(info, restaurant_data_all,('Name','City'))
|
112 |
+
for cuisine in annotation_data['local_constraint']['cuisine']:
|
113 |
+
if cuisine in restaurant_data_filtered.iloc[0]['Cuisines']:
|
114 |
+
cuisine_set.add(cuisine)
|
115 |
+
return cuisine_set
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
|
120 |
+
def get_valid_name_city(info):
|
121 |
+
# Modified the pattern to preserve spaces at the end of the name
|
122 |
+
pattern = r'(.*?),\s*([^,]+)(\(\w[\w\s]*\))?$'
|
123 |
+
match = re.search(pattern, info)
|
124 |
+
if match:
|
125 |
+
return match.group(1).strip(), extract_before_parenthesis(match.group(2).strip()).strip()
|
126 |
+
else:
|
127 |
+
print(f"{info} can not be parsed, '-' will be used instead.")
|
128 |
+
return "-","-"
|
129 |
+
|
130 |
+
|
131 |
+
def extract_numbers_from_filenames(directory):
|
132 |
+
# Define the pattern to match files
|
133 |
+
pattern = r'annotation_(\d+).json'
|
134 |
+
|
135 |
+
# List all files in the directory
|
136 |
+
files = os.listdir(directory)
|
137 |
+
|
138 |
+
# Extract numbers from filenames that match the pattern
|
139 |
+
numbers = [int(re.search(pattern, file).group(1)) for file in files if re.match(pattern, file)]
|
140 |
+
|
141 |
+
return numbers
|
142 |
+
|
143 |
+
def get_city_list(days, deparure_city, destination):
|
144 |
+
city_list = []
|
145 |
+
city_list.append(deparure_city)
|
146 |
+
if days == 3:
|
147 |
+
city_list.append(destination)
|
148 |
+
else:
|
149 |
+
city_set = open('../database/background/citySet_with_states.txt').read().split('\n')
|
150 |
+
state_city_map = {}
|
151 |
+
for unit in city_set:
|
152 |
+
city, state = unit.split('\t')
|
153 |
+
if state not in state_city_map:
|
154 |
+
state_city_map[state] = []
|
155 |
+
state_city_map[state].append(city)
|
156 |
+
for city in state_city_map[destination]:
|
157 |
+
if city != deparure_city:
|
158 |
+
city_list.append(city + f"({destination})")
|
159 |
+
return city_list
|
160 |
+
|
161 |
+
def get_filtered_data(component,data, column_name=('NAME','city')):
|
162 |
+
name, city = get_valid_name_city(component)
|
163 |
+
return data[(data[column_name[0]] == name) & (data[column_name[1]] == city)]
|
164 |
+
|
165 |
+
def extract_before_parenthesis(s):
|
166 |
+
match = re.search(r'^(.*?)\([^)]*\)', s)
|
167 |
+
return match.group(1) if match else s
|
168 |
+
|
169 |
+
def count_consecutive_values(lst):
|
170 |
+
if not lst:
|
171 |
+
return []
|
172 |
+
|
173 |
+
result = []
|
174 |
+
current_string = lst[0]
|
175 |
+
count = 1
|
176 |
+
|
177 |
+
for i in range(1, len(lst)):
|
178 |
+
if lst[i] == current_string:
|
179 |
+
count += 1
|
180 |
+
else:
|
181 |
+
result.append((current_string, count))
|
182 |
+
current_string = lst[i]
|
183 |
+
count = 1
|
184 |
+
|
185 |
+
result.append((current_string, count)) # Add the last group of values
|
186 |
+
return result
|
app.py
ADDED
@@ -0,0 +1,197 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import sys
|
3 |
+
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard/evaluation")))
|
4 |
+
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "./leaderboard")))
|
5 |
+
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
6 |
+
import json
|
7 |
+
import datetime
|
8 |
+
from email.utils import parseaddr
|
9 |
+
|
10 |
+
import gradio as gr
|
11 |
+
import pandas as pd
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
from datasets import load_dataset
|
15 |
+
from apscheduler.schedulers.background import BackgroundScheduler
|
16 |
+
from huggingface_hub import HfApi
|
17 |
+
|
18 |
+
# InfoStrings
|
19 |
+
# from scorer import question_scorer
|
20 |
+
from content import format_error, format_warning, format_log, TITLE, INTRODUCTION_TEXT, CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, model_hyperlink
|
21 |
+
from evaluation.eval import eval_score
|
22 |
+
|
23 |
+
TOKEN = os.environ.get("TOKEN", None)
|
24 |
+
|
25 |
+
OWNER="osunlp"
|
26 |
+
DATA_DATASET = f"{OWNER}/TravelBench"
|
27 |
+
EVAL_DATASET = f"{OWNER}/TravelBenchEval"
|
28 |
+
|
29 |
+
api = HfApi()
|
30 |
+
|
31 |
+
YEAR_VERSION = "2024"
|
32 |
+
|
33 |
+
os.makedirs("scored", exist_ok=True)
|
34 |
+
|
35 |
+
# # Display the results
|
36 |
+
eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
|
37 |
+
def get_dataframe_from_results(eval_results, split):
|
38 |
+
local_df = eval_results[split]
|
39 |
+
local_df = local_df.remove_columns(["Mail"])
|
40 |
+
df = pd.DataFrame(local_df)
|
41 |
+
df = df.sort_values(by=["Final Pass Rate"], ascending=False)
|
42 |
+
numeric_cols = [c for c in local_df.column_names if "Rate" in c]
|
43 |
+
df[numeric_cols] = df[numeric_cols].multiply(100).round(decimals=2)
|
44 |
+
return df
|
45 |
+
|
46 |
+
|
47 |
+
eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
|
48 |
+
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
|
49 |
+
|
50 |
+
|
51 |
+
|
52 |
+
# def restart_space():
|
53 |
+
# api.restart_space(repo_id=LEADERBOARD_PATH, token=TOKEN)
|
54 |
+
|
55 |
+
|
56 |
+
def load_line_json_data(filename):
|
57 |
+
data = []
|
58 |
+
with open(filename, 'r', encoding='utf-8') as f:
|
59 |
+
for line in f.read().strip().split('\n'):
|
60 |
+
unit = json.loads(line)
|
61 |
+
data.append(unit)
|
62 |
+
return data
|
63 |
+
|
64 |
+
|
65 |
+
def add_new_eval(
|
66 |
+
val_or_test: str,
|
67 |
+
eval_mode: str,
|
68 |
+
model: str,
|
69 |
+
planning_strategy: str,
|
70 |
+
organization: str,
|
71 |
+
mail: str,
|
72 |
+
path_to_file: str,
|
73 |
+
):
|
74 |
+
# Very basic email parsing
|
75 |
+
_, parsed_mail = parseaddr(mail)
|
76 |
+
if not "@" in parsed_mail:
|
77 |
+
return format_warning("Please provide a valid email adress.")
|
78 |
+
|
79 |
+
print("Adding new eval")
|
80 |
+
|
81 |
+
if path_to_file is None:
|
82 |
+
return format_warning("Please attach a file.")
|
83 |
+
|
84 |
+
# Save submitted file
|
85 |
+
api.upload_file(
|
86 |
+
repo_id=EVAL_DATASET,
|
87 |
+
path_or_fileobj=path_to_file.name,
|
88 |
+
path_in_repo=f"{organization}/{val_or_test}_{eval_mode}_{planning_strategy}_raw_{datetime.datetime.today()}.jsonl",
|
89 |
+
repo_type="dataset",
|
90 |
+
token=TOKEN
|
91 |
+
)
|
92 |
+
|
93 |
+
# Compute score
|
94 |
+
file_path = path_to_file.name
|
95 |
+
result = eval_score(val_or_test,file_path=file_path,TOKEN=TOKEN)
|
96 |
+
with open(f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl", "w") as scored_file:
|
97 |
+
scored_file.write(json.dumps(result) + "\n")
|
98 |
+
|
99 |
+
# Save scored file
|
100 |
+
api.upload_file(
|
101 |
+
repo_id=EVAL_DATASET,
|
102 |
+
path_or_fileobj=f"scored/{organization}_{val_or_test}_{eval_mode}_{planning_strategy}.jsonl",
|
103 |
+
path_in_repo=f"{organization}/{model}/{val_or_test}_{eval_mode}_{planning_strategy}_scored_{datetime.datetime.today()}.jsonl",
|
104 |
+
repo_type="dataset",
|
105 |
+
token=TOKEN
|
106 |
+
)
|
107 |
+
|
108 |
+
# Actual submission
|
109 |
+
eval_entry = {
|
110 |
+
"Model": model,
|
111 |
+
"Planning Strategy": planning_strategy,
|
112 |
+
"Organization": organization,
|
113 |
+
"Mail": mail,
|
114 |
+
"Delivery Rate": result['Delivery Rate'],
|
115 |
+
"Commonsense Constraint Micro Pass Rate":result['Commonsense Constraint Micro Pass Rate'],
|
116 |
+
"Commonsense Constraint Macro Pass Rate":result['Commonsense Constraint Macro Pass Rate'],
|
117 |
+
"Hard Constraint Micro Pass Rate":result['Hard Constraint Micro Pass Rate'],
|
118 |
+
"Hard Constraint Macro Pass Rate":result['Hard Constraint Macro Pass Rate'],
|
119 |
+
"Final Pass Rate":result['Final Pass Rate']
|
120 |
+
}
|
121 |
+
|
122 |
+
eval_results[val_or_test] = eval_results[val_or_test].add_item(eval_entry)
|
123 |
+
|
124 |
+
print(eval_results)
|
125 |
+
|
126 |
+
eval_results.push_to_hub(EVAL_DATASET, config_name = 'scores', token=TOKEN)
|
127 |
+
|
128 |
+
return format_log(f"Model {model} submitted by {organization} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed")
|
129 |
+
|
130 |
+
|
131 |
+
def refresh():
|
132 |
+
eval_results = load_dataset(EVAL_DATASET, 'scores', token=TOKEN, download_mode="force_redownload", ignore_verifications=True)
|
133 |
+
eval_dataframe_val = get_dataframe_from_results(eval_results=eval_results, split="validation")
|
134 |
+
eval_dataframe_test = get_dataframe_from_results(eval_results=eval_results, split="test")
|
135 |
+
return eval_dataframe_val, eval_dataframe_test
|
136 |
+
|
137 |
+
# def upload_file(files):
|
138 |
+
# file_paths = [file.name for file in files]
|
139 |
+
# return file_paths
|
140 |
+
|
141 |
+
|
142 |
+
demo = gr.Blocks()
|
143 |
+
with demo:
|
144 |
+
gr.HTML(TITLE)
|
145 |
+
gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
|
146 |
+
|
147 |
+
with gr.Tab("Results: Validation"):
|
148 |
+
leaderboard_table_val = gr.components.Dataframe(
|
149 |
+
value=eval_dataframe_val, interactive=False,
|
150 |
+
)
|
151 |
+
with gr.Tab("Results: Test"):
|
152 |
+
leaderboard_table_test = gr.components.Dataframe(
|
153 |
+
value=eval_dataframe_test, interactive=False,
|
154 |
+
)
|
155 |
+
|
156 |
+
refresh_button = gr.Button("Refresh")
|
157 |
+
refresh_button.click(
|
158 |
+
refresh,
|
159 |
+
inputs=[],
|
160 |
+
outputs=[
|
161 |
+
leaderboard_table_val,
|
162 |
+
leaderboard_table_test,
|
163 |
+
],
|
164 |
+
)
|
165 |
+
with gr.Accordion("Submit a new file for evaluation"):
|
166 |
+
with gr.Row():
|
167 |
+
with gr.Column():
|
168 |
+
level_of_test = gr.Radio(["validation", "test"], value="validation", label="Split")
|
169 |
+
eval_mode = gr.Radio(["two-stage", "sole-planning"], value="two-stage", label="Eval Mode")
|
170 |
+
model = gr.Textbox(label="Foundation Model")
|
171 |
+
planning_strategy = gr.Textbox(label="Planning Strategy")
|
172 |
+
with gr.Column():
|
173 |
+
organization = gr.Textbox(label="Organization")
|
174 |
+
mail = gr.Textbox(label="Contact email")
|
175 |
+
file_output = gr.File()
|
176 |
+
|
177 |
+
|
178 |
+
submit_button = gr.Button("Submit Eval")
|
179 |
+
submission_result = gr.Markdown()
|
180 |
+
submit_button.click(
|
181 |
+
add_new_eval,
|
182 |
+
[
|
183 |
+
level_of_test,
|
184 |
+
eval_mode,
|
185 |
+
model,
|
186 |
+
planning_strategy,
|
187 |
+
organization,
|
188 |
+
mail,
|
189 |
+
file_output,
|
190 |
+
],
|
191 |
+
submission_result,
|
192 |
+
)
|
193 |
+
|
194 |
+
# scheduler = BackgroundScheduler()
|
195 |
+
# scheduler.add_job(restart_space, "interval", seconds=3600)
|
196 |
+
# scheduler.start()
|
197 |
+
demo.launch(debug=True)
|
content.py
ADDED
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
TITLE = """<h1 align="center" id="space-title">TravelBench Leaderboard</h1>"""
|
2 |
+
|
3 |
+
INTRODUCTION_TEXT = """
|
4 |
+
TravelBench is a benchmark crafted for evaluating language agents in tool-use and complex planning within multiple constraints. (See our [paper](https://arxiv.org/abs/2311.12983) for more details.)
|
5 |
+
|
6 |
+
## Data
|
7 |
+
In TravelBench, for a given query, language agents are expected to formulate a comprehensive plan that includes transportation, daily meals, attractions, and accommodation for each day.
|
8 |
+
For constraints, from the perspective of real world applications, we design three types of them: Environment Constraint, Commonsense Constraint, and Hard Constraint.
|
9 |
+
TravelBench comprises 1,225 queries in total. The number of days and hard constraints are designed to test agents' abilities across both the breadth and depth of complex planning.
|
10 |
+
|
11 |
+
TravelBench data can be found in [this dataset](https://huggingface.co/datasets/osunlp/TravelBench).
|
12 |
+
|
13 |
+
## Submission Guidelines for TravelBench
|
14 |
+
Participants are invited to submit results for both validation and testing phases. The submissions will be evaluated based on several metrics: delivery rate, commonsense constraint pass rate (micro/macro), hard constraint pass rate (micro/macro), and the final pass rate.
|
15 |
+
|
16 |
+
### Format of Submission:
|
17 |
+
Submissions must be in the form of a JSON-line file. Each line should adhere to the following structure:
|
18 |
+
```
|
19 |
+
{"idx":0,"query":"Natural Language Query","plan":[{"day": 1, "current_city": "from [City A] to [City B]", "transportation": "Flight Number: XXX, from A to B", "breakfast": "Name, City", "attraction": "Name, City;Name, City;...;Name, City;", "lunch": "Name, City", "dinner": "Name, City", "accommodation": "Name, City"}, {"day": 2, "current_city": "City B", "transportation": "-", "breakfast": "Name, City", "attraction": "Name, City;Name, City;", "lunch": "Name, City", "dinner": "Name, City", "accommodation": "Name, City"}, ...]}
|
20 |
+
```
|
21 |
+
Explanation of Fields:
|
22 |
+
#### day:
|
23 |
+
Description: Indicates the specific day in the itinerary.
|
24 |
+
Format: Enter the numerical value representing the sequence of the day within the travel plan. For instance, '1' for the first day, '2' for the second day, and so on.
|
25 |
+
|
26 |
+
#### current city:
|
27 |
+
Description: Indicates the city where the traveler is currently located.
|
28 |
+
Format: When there is a change in location, use "from [City A] to [City B]" to denote the transition. If remaining in the same city, simply use the city's name (e.g., "City A").
|
29 |
+
|
30 |
+
#### transportation:
|
31 |
+
Description: Specifies the mode of transportation used.
|
32 |
+
Format: For flights, include the details in the format "Flight Number: XXX, from [City A] to [City B]". For self-driven or taxi travel, use "self-driving/taxi, from [City A] to [City B]". If there is no travel between cities on that day, use "-".
|
33 |
+
|
34 |
+
#### breakfast, lunch, and dinner:
|
35 |
+
Description: Details about dining arrangements.
|
36 |
+
Format: Use "Name, City" to specify the chosen restaurant and its location. If a meal is not planned, use "-".
|
37 |
+
|
38 |
+
#### attraction:
|
39 |
+
Description: Information about attractions visited.
|
40 |
+
Format: List attractions as "Name, City". If visiting multiple attractions, separate them with a semicolon ";". If no attraction is planned, use "-".
|
41 |
+
|
42 |
+
Please refer to [this](https://huggingface.co/datasets/osunlp/TravelBench/resolve/main/example_submission.jsonl?download=true) for example submission file.
|
43 |
+
|
44 |
+
Submission made by our team are labelled "TravelBench authors". Each submission will be automatically evaluated and scored based on the predefined metrics. The scores and rankings will be updated and displayed on the leaderboard.
|
45 |
+
|
46 |
+
"""
|
47 |
+
|
48 |
+
CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
|
49 |
+
CITATION_BUTTON_TEXT = r"""@misc{Xie2024TravelBench,
|
50 |
+
title={},
|
51 |
+
author={},
|
52 |
+
year={2024},
|
53 |
+
eprint={,
|
54 |
+
archivePrefix={arXiv},
|
55 |
+
primaryClass={cs.CL}
|
56 |
+
}"""
|
57 |
+
|
58 |
+
|
59 |
+
def format_error(msg):
|
60 |
+
return f"<p style='color: red; font-size: 20px; text-align: center;'>{msg}</p>"
|
61 |
+
|
62 |
+
def format_warning(msg):
|
63 |
+
return f"<p style='color: orange; font-size: 20px; text-align: center;'>{msg}</p>"
|
64 |
+
|
65 |
+
def format_log(msg):
|
66 |
+
return f"<p style='color: green; font-size: 20px; text-align: center;'>{msg}</p>"
|
67 |
+
|
68 |
+
def model_hyperlink(link, model_name):
|
69 |
+
return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
70 |
+
|
database/.DS_Store
ADDED
Binary file (8.2 kB). View file
|
|
database/accommodations/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
database/accommodations/clean_accommodations_2022.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
database/attractions/attractions.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
database/background/attractions.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
database/background/citySet.txt
ADDED
@@ -0,0 +1,311 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
San Diego
|
2 |
+
Pellston
|
3 |
+
Buffalo
|
4 |
+
Charlotte Amalie
|
5 |
+
Flagstaff
|
6 |
+
Evansville
|
7 |
+
Hilo
|
8 |
+
Twin Falls
|
9 |
+
Newark
|
10 |
+
State College
|
11 |
+
Johnstown
|
12 |
+
Montgomery
|
13 |
+
Redding
|
14 |
+
Lynchburg
|
15 |
+
South Bend
|
16 |
+
Sarasota
|
17 |
+
Sioux Falls
|
18 |
+
Paducah
|
19 |
+
Kahului
|
20 |
+
Atlantic City
|
21 |
+
Bemidji
|
22 |
+
Toledo
|
23 |
+
Abilene
|
24 |
+
Sacramento
|
25 |
+
Amarillo
|
26 |
+
Moline
|
27 |
+
Hilton Head
|
28 |
+
Manhattan
|
29 |
+
Minneapolis
|
30 |
+
Fort Myers
|
31 |
+
Roswell
|
32 |
+
Harlingen
|
33 |
+
Seattle
|
34 |
+
Manchester
|
35 |
+
Gulfport
|
36 |
+
Gainesville
|
37 |
+
Pago Pago
|
38 |
+
Wrangell
|
39 |
+
Augusta
|
40 |
+
Waterloo
|
41 |
+
Yuma
|
42 |
+
Saipan
|
43 |
+
Christiansted
|
44 |
+
North Bend
|
45 |
+
Richmond
|
46 |
+
Albuquerque
|
47 |
+
Nashville
|
48 |
+
Aberdeen
|
49 |
+
Harrisburg
|
50 |
+
Fort Wayne
|
51 |
+
Green Bay
|
52 |
+
Wenatchee
|
53 |
+
Santa Fe
|
54 |
+
St. Petersburg
|
55 |
+
Belleville
|
56 |
+
Greensboro
|
57 |
+
Lake Charles
|
58 |
+
Traverse City
|
59 |
+
Erie
|
60 |
+
Niagara Falls
|
61 |
+
Pocatello
|
62 |
+
Idaho Falls
|
63 |
+
Alpena
|
64 |
+
Wilmington
|
65 |
+
Ontario
|
66 |
+
Iron Mountain
|
67 |
+
Lubbock
|
68 |
+
Helena
|
69 |
+
Kalamazoo
|
70 |
+
Cleveland
|
71 |
+
Grand Island
|
72 |
+
New Bern
|
73 |
+
Melbourne
|
74 |
+
Bristol
|
75 |
+
Orlando
|
76 |
+
Bismarck
|
77 |
+
Fresno
|
78 |
+
Billings
|
79 |
+
Daytona Beach
|
80 |
+
College Station
|
81 |
+
Jacksonville
|
82 |
+
Salt Lake City
|
83 |
+
Corpus Christi
|
84 |
+
Florence
|
85 |
+
Moab
|
86 |
+
Grand Forks
|
87 |
+
Las Vegas
|
88 |
+
Fairbanks
|
89 |
+
Petersburg
|
90 |
+
Wichita
|
91 |
+
Rhinelander
|
92 |
+
Kansas City
|
93 |
+
Dothan
|
94 |
+
Alamosa
|
95 |
+
Adak Island
|
96 |
+
Islip
|
97 |
+
Wichita Falls
|
98 |
+
Presque Isle
|
99 |
+
San Luis Obispo
|
100 |
+
Dayton
|
101 |
+
Fort Smith
|
102 |
+
Martha's Vineyard
|
103 |
+
Portland
|
104 |
+
Waco
|
105 |
+
New York
|
106 |
+
Columbus
|
107 |
+
Tampa
|
108 |
+
Little Rock
|
109 |
+
Kona
|
110 |
+
Clarksburg
|
111 |
+
San Angelo
|
112 |
+
Saginaw
|
113 |
+
Houston
|
114 |
+
Duluth
|
115 |
+
Valparaiso
|
116 |
+
Phoenix
|
117 |
+
Oakland
|
118 |
+
Watertown
|
119 |
+
Ogden
|
120 |
+
Cedar Rapids
|
121 |
+
Cape Girardeau
|
122 |
+
Sun Valley
|
123 |
+
Sault Ste. Marie
|
124 |
+
Trenton
|
125 |
+
Missoula
|
126 |
+
Pasco
|
127 |
+
Brainerd
|
128 |
+
Newburgh
|
129 |
+
Gustavus
|
130 |
+
Branson
|
131 |
+
Providence
|
132 |
+
Minot
|
133 |
+
Huntsville
|
134 |
+
San Antonio
|
135 |
+
Marquette
|
136 |
+
Owensboro
|
137 |
+
Del Rio
|
138 |
+
Portsmouth
|
139 |
+
Bloomington
|
140 |
+
Lexington
|
141 |
+
Santa Barbara
|
142 |
+
Baltimore
|
143 |
+
Panama City
|
144 |
+
Kodiak
|
145 |
+
Yakima
|
146 |
+
Vernal
|
147 |
+
Salisbury
|
148 |
+
Mission
|
149 |
+
Newport News
|
150 |
+
Charlottesville
|
151 |
+
Grand Junction
|
152 |
+
Baton Rouge
|
153 |
+
Beaumont
|
154 |
+
Staunton
|
155 |
+
Kalispell
|
156 |
+
Key West
|
157 |
+
Worcester
|
158 |
+
West Palm Beach
|
159 |
+
Boise
|
160 |
+
Grand Rapids
|
161 |
+
Salina
|
162 |
+
Fort Leonard Wood
|
163 |
+
Walla Walla
|
164 |
+
Everett
|
165 |
+
Dillingham
|
166 |
+
Lansing
|
167 |
+
Madison
|
168 |
+
Victoria
|
169 |
+
Sioux City
|
170 |
+
Hattiesburg
|
171 |
+
Stockton
|
172 |
+
Anchorage
|
173 |
+
Charlotte
|
174 |
+
Jamestown
|
175 |
+
Laramie
|
176 |
+
Decatur
|
177 |
+
Durango
|
178 |
+
Longview
|
179 |
+
Syracuse
|
180 |
+
St. Cloud
|
181 |
+
Santa Rosa
|
182 |
+
Bakersfield
|
183 |
+
North Platte
|
184 |
+
La Crosse
|
185 |
+
Plattsburgh
|
186 |
+
Concord
|
187 |
+
Atlanta
|
188 |
+
Provo
|
189 |
+
Ogdensburg
|
190 |
+
Ithaca
|
191 |
+
Colorado Springs
|
192 |
+
Washington
|
193 |
+
Williston
|
194 |
+
Tulsa
|
195 |
+
Midland
|
196 |
+
Champaign
|
197 |
+
Devils Lake
|
198 |
+
Greer
|
199 |
+
Muskegon
|
200 |
+
Hibbing
|
201 |
+
Santa Ana
|
202 |
+
Ponce
|
203 |
+
Prescott
|
204 |
+
Indianapolis
|
205 |
+
International Falls
|
206 |
+
Rapid City
|
207 |
+
Ketchikan
|
208 |
+
St. Louis
|
209 |
+
Santa Maria
|
210 |
+
Elmira
|
211 |
+
Alexandria
|
212 |
+
San Jose
|
213 |
+
Tucson
|
214 |
+
San Juan
|
215 |
+
Dubuque
|
216 |
+
Burbank
|
217 |
+
Gunnison
|
218 |
+
Cedar City
|
219 |
+
Hyannis
|
220 |
+
Raleigh
|
221 |
+
Norfolk
|
222 |
+
New Orleans
|
223 |
+
Medford
|
224 |
+
White Plains
|
225 |
+
Oklahoma City
|
226 |
+
Chicago
|
227 |
+
El Paso
|
228 |
+
Rockford
|
229 |
+
Aguadilla
|
230 |
+
Omaha
|
231 |
+
Scottsbluff
|
232 |
+
Yakutat
|
233 |
+
Arcata
|
234 |
+
Spokane
|
235 |
+
Brownsville
|
236 |
+
Bend
|
237 |
+
Hagerstown
|
238 |
+
Peoria
|
239 |
+
Appleton
|
240 |
+
Roanoke
|
241 |
+
Eugene
|
242 |
+
Rock Springs
|
243 |
+
Dodge City
|
244 |
+
Austin
|
245 |
+
Miami
|
246 |
+
Dallas
|
247 |
+
Mosinee
|
248 |
+
Killeen
|
249 |
+
Lihue
|
250 |
+
Pittsburgh
|
251 |
+
Tallahassee
|
252 |
+
Butte
|
253 |
+
Lawton
|
254 |
+
Honolulu
|
255 |
+
Greenville
|
256 |
+
Juneau
|
257 |
+
Myrtle Beach
|
258 |
+
Boston
|
259 |
+
Charleston
|
260 |
+
Latrobe
|
261 |
+
Knoxville
|
262 |
+
Denver
|
263 |
+
Bangor
|
264 |
+
Albany
|
265 |
+
Punta Gorda
|
266 |
+
Fort Lauderdale
|
267 |
+
Philadelphia
|
268 |
+
Binghamton
|
269 |
+
Great Falls
|
270 |
+
Shreveport
|
271 |
+
Asheville
|
272 |
+
Cheyenne
|
273 |
+
Milwaukee
|
274 |
+
Nome
|
275 |
+
Laredo
|
276 |
+
Des Moines
|
277 |
+
Fayetteville
|
278 |
+
Lewisburg
|
279 |
+
Fort Dodge
|
280 |
+
Cody
|
281 |
+
Chattanooga
|
282 |
+
Deadhorse
|
283 |
+
Kotzebue
|
284 |
+
Sitka
|
285 |
+
Bozeman
|
286 |
+
Palm Springs
|
287 |
+
Memphis
|
288 |
+
Nantucket
|
289 |
+
Texarkana
|
290 |
+
Lewiston
|
291 |
+
Valdosta
|
292 |
+
Birmingham
|
293 |
+
Scranton
|
294 |
+
Pensacola
|
295 |
+
Hancock
|
296 |
+
Los Angeles
|
297 |
+
Mason City
|
298 |
+
Savannah
|
299 |
+
West Yellowstone
|
300 |
+
Long Beach
|
301 |
+
Reno
|
302 |
+
Akron
|
303 |
+
Louisville
|
304 |
+
Hartford
|
305 |
+
Cincinnati
|
306 |
+
Rochester
|
307 |
+
San Francisco
|
308 |
+
Detroit
|
309 |
+
Monterey
|
310 |
+
Escanaba
|
311 |
+
Eau Claire
|
database/background/citySet_with_states.txt
ADDED
@@ -0,0 +1,312 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
San Diego California
|
2 |
+
Pellston Michigan
|
3 |
+
Buffalo New York
|
4 |
+
Charlotte Amalie St. Thomas
|
5 |
+
Flagstaff Arizona
|
6 |
+
Evansville Indiana
|
7 |
+
Hilo Hawaii
|
8 |
+
Twin Falls Idaho
|
9 |
+
Newark New Jersey
|
10 |
+
State College Pennsylvania
|
11 |
+
Johnstown Pennsylvania
|
12 |
+
Charleston South Carolina
|
13 |
+
Montgomery Alabama
|
14 |
+
Redding California
|
15 |
+
Lynchburg Virginia
|
16 |
+
South Bend Indiana
|
17 |
+
Sarasota Florida
|
18 |
+
Sioux Falls South Dakota
|
19 |
+
Paducah Kentucky
|
20 |
+
Kahului Hawaii
|
21 |
+
Atlantic City New Jersey
|
22 |
+
Bemidji Minnesota
|
23 |
+
Toledo Ohio
|
24 |
+
Abilene Texas
|
25 |
+
Sacramento California
|
26 |
+
Amarillo Texas
|
27 |
+
Moline Illinois
|
28 |
+
Hilton Head South Carolina
|
29 |
+
Manhattan New York
|
30 |
+
Minneapolis Minnesota
|
31 |
+
Fort Myers Florida
|
32 |
+
Roswell New Mexico
|
33 |
+
Harlingen Texas
|
34 |
+
Seattle Washington
|
35 |
+
Manchester England
|
36 |
+
Gulfport Mississippi
|
37 |
+
Gainesville Florida
|
38 |
+
Pago Pago Eastern District
|
39 |
+
Wrangell Alaska
|
40 |
+
Augusta Georgia
|
41 |
+
Waterloo Wallonia
|
42 |
+
Yuma Arizona
|
43 |
+
Saipan Saipan
|
44 |
+
Christiansted St. Croix
|
45 |
+
North Bend Oregon
|
46 |
+
Richmond Virginia
|
47 |
+
Albuquerque New Mexico
|
48 |
+
Nashville Tennessee
|
49 |
+
Aberdeen Scotland
|
50 |
+
Harrisburg Pennsylvania
|
51 |
+
Fort Wayne Indiana
|
52 |
+
Green Bay Wisconsin
|
53 |
+
Wenatchee Washington
|
54 |
+
Santa Fe New Mexico
|
55 |
+
St. Petersburg Saint Petersburg
|
56 |
+
Belleville Illinois
|
57 |
+
Greensboro North Carolina
|
58 |
+
Lake Charles Louisiana
|
59 |
+
Traverse City Michigan
|
60 |
+
Erie Pennsylvania
|
61 |
+
Niagara Falls New York
|
62 |
+
Pocatello Idaho
|
63 |
+
Idaho Falls Idaho
|
64 |
+
Alpena Michigan
|
65 |
+
Wilmington North Carolina
|
66 |
+
Ontario Ontario
|
67 |
+
Iron Mountain Michigan
|
68 |
+
Lubbock Texas
|
69 |
+
Helena Montana
|
70 |
+
Kalamazoo Michigan
|
71 |
+
Cleveland Ohio
|
72 |
+
Grand Island Nebraska
|
73 |
+
New Bern North Carolina
|
74 |
+
Melbourne Victoria
|
75 |
+
Bristol Tennessee
|
76 |
+
Orlando Florida
|
77 |
+
Bismarck North Dakota
|
78 |
+
Fresno California
|
79 |
+
Billings Montana
|
80 |
+
Jackson Mississippi
|
81 |
+
Daytona Beach Florida
|
82 |
+
College Station Texas
|
83 |
+
Jacksonville Florida
|
84 |
+
Salt Lake City Utah
|
85 |
+
Corpus Christi Texas
|
86 |
+
Florence Tuscany
|
87 |
+
Moab Utah
|
88 |
+
Grand Forks North Dakota
|
89 |
+
Las Vegas Nevada
|
90 |
+
Fairbanks Alaska
|
91 |
+
Petersburg Virginia
|
92 |
+
Wichita Kansas
|
93 |
+
Rhinelander Wisconsin
|
94 |
+
Kansas City Missouri
|
95 |
+
Dothan Alabama
|
96 |
+
Alamosa Colorado
|
97 |
+
Adak Island Alaska
|
98 |
+
Islip New York
|
99 |
+
Wichita Falls Texas
|
100 |
+
Presque Isle Maine
|
101 |
+
San Luis Obispo California
|
102 |
+
Dayton Ohio
|
103 |
+
Fort Smith Arkansas
|
104 |
+
Martha's Vineyard Massachusetts
|
105 |
+
Portland Oregon
|
106 |
+
Waco Texas
|
107 |
+
New York New York
|
108 |
+
Columbus Ohio
|
109 |
+
Tampa Florida
|
110 |
+
Little Rock Arkansas
|
111 |
+
Kona Hawaii
|
112 |
+
Clarksburg West Virginia
|
113 |
+
San Angelo Texas
|
114 |
+
Saginaw Michigan
|
115 |
+
Houston Texas
|
116 |
+
Duluth Minnesota
|
117 |
+
Valparaiso Indiana
|
118 |
+
Phoenix Arizona
|
119 |
+
Oakland California
|
120 |
+
Watertown New York
|
121 |
+
Ogden Utah
|
122 |
+
Cedar Rapids Iowa
|
123 |
+
Cape Girardeau Missouri
|
124 |
+
Sun Valley Idaho
|
125 |
+
Sault Ste. Marie Ontario
|
126 |
+
Trenton New Jersey
|
127 |
+
Missoula Montana
|
128 |
+
Pasco Washington
|
129 |
+
Brainerd Minnesota
|
130 |
+
Newburgh New York
|
131 |
+
Gustavus Minnesota
|
132 |
+
Branson Missouri
|
133 |
+
Providence Rhode Island
|
134 |
+
Minot North Dakota
|
135 |
+
Huntsville Alabama
|
136 |
+
San Antonio Texas
|
137 |
+
Marquette Wisconsin
|
138 |
+
Owensboro Kentucky
|
139 |
+
Del Rio Texas
|
140 |
+
Portsmouth England
|
141 |
+
Bloomington Illinois
|
142 |
+
Lexington Kentucky
|
143 |
+
Santa Barbara California
|
144 |
+
Baltimore Maryland
|
145 |
+
Panama City Florida
|
146 |
+
Kodiak Alaska
|
147 |
+
Yakima Washington
|
148 |
+
Vernal Utah
|
149 |
+
Salisbury Maryland
|
150 |
+
Mission Texas
|
151 |
+
Newport News Virginia
|
152 |
+
Charlottesville Virginia
|
153 |
+
Grand Junction Colorado
|
154 |
+
Baton Rouge Louisiana
|
155 |
+
Beaumont Texas
|
156 |
+
Staunton Virginia
|
157 |
+
Kalispell Montana
|
158 |
+
Key West Florida
|
159 |
+
Worcester England
|
160 |
+
West Palm Beach Florida
|
161 |
+
Boise Idaho
|
162 |
+
Grand Rapids Michigan
|
163 |
+
Salina Kansas
|
164 |
+
Fort Leonard Wood Missouri
|
165 |
+
Walla Walla Washington
|
166 |
+
Everett Washington
|
167 |
+
Dillingham Alaska
|
168 |
+
Lansing Michigan
|
169 |
+
Madison Wisconsin
|
170 |
+
Victoria Victoria
|
171 |
+
Sioux City Iowa
|
172 |
+
Hattiesburg Mississippi
|
173 |
+
Stockton California
|
174 |
+
Anchorage Alaska
|
175 |
+
Charlotte North Carolina
|
176 |
+
Jamestown Virginia
|
177 |
+
Laramie Wyoming
|
178 |
+
Decatur Georgia
|
179 |
+
Durango Colorado
|
180 |
+
Longview Texas
|
181 |
+
Syracuse New York
|
182 |
+
St. Cloud Minnesota
|
183 |
+
Santa Rosa California
|
184 |
+
Bakersfield California
|
185 |
+
North Platte Nebraska
|
186 |
+
La Crosse Wisconsin
|
187 |
+
Plattsburgh New York
|
188 |
+
Concord New Hampshire
|
189 |
+
Atlanta Georgia
|
190 |
+
Provo Utah
|
191 |
+
Ogdensburg New York
|
192 |
+
Ithaca New York
|
193 |
+
Colorado Springs Colorado
|
194 |
+
Washington District of Columbia
|
195 |
+
Williston North Dakota
|
196 |
+
Tulsa Oklahoma
|
197 |
+
Midland Texas
|
198 |
+
Champaign Illinois
|
199 |
+
Devils Lake Wisconsin
|
200 |
+
Greer South Carolina
|
201 |
+
Muskegon Michigan
|
202 |
+
Hibbing Minnesota
|
203 |
+
Santa Ana California
|
204 |
+
Ponce Ponce
|
205 |
+
Prescott Arizona
|
206 |
+
Indianapolis Indiana
|
207 |
+
International Falls Minnesota
|
208 |
+
Rapid City South Dakota
|
209 |
+
Ketchikan Alaska
|
210 |
+
St. Louis Missouri
|
211 |
+
Santa Maria California
|
212 |
+
Elmira New York
|
213 |
+
Alexandria Alexandria Governorate
|
214 |
+
San Jose California
|
215 |
+
Tucson Arizona
|
216 |
+
San Juan San Juan
|
217 |
+
Dubuque Iowa
|
218 |
+
Burbank California
|
219 |
+
Gunnison Colorado
|
220 |
+
Cedar City Utah
|
221 |
+
Hyannis Massachusetts
|
222 |
+
Raleigh North Carolina
|
223 |
+
Norfolk Virginia
|
224 |
+
New Orleans Louisiana
|
225 |
+
Medford Oregon
|
226 |
+
White Plains New York
|
227 |
+
Oklahoma City Oklahoma
|
228 |
+
Chicago Illinois
|
229 |
+
El Paso Texas
|
230 |
+
Rockford Illinois
|
231 |
+
Aguadilla Aguadilla
|
232 |
+
Omaha Nebraska
|
233 |
+
Scottsbluff Nebraska
|
234 |
+
Yakutat Alaska
|
235 |
+
Arcata California
|
236 |
+
Spokane Washington
|
237 |
+
Brownsville Texas
|
238 |
+
Bend Oregon
|
239 |
+
Hagerstown Maryland
|
240 |
+
Peoria Illinois
|
241 |
+
Appleton Wisconsin
|
242 |
+
Roanoke Virginia
|
243 |
+
Eugene Oregon
|
244 |
+
Rock Springs Wyoming
|
245 |
+
Dodge City Kansas
|
246 |
+
Austin Texas
|
247 |
+
Miami Florida
|
248 |
+
Dallas Texas
|
249 |
+
Mosinee Wisconsin
|
250 |
+
Killeen Texas
|
251 |
+
Lihue Hawaii
|
252 |
+
Pittsburgh Pennsylvania
|
253 |
+
Tallahassee Florida
|
254 |
+
Butte California
|
255 |
+
Lawton Oklahoma
|
256 |
+
Honolulu Hawaii
|
257 |
+
Greenville South Carolina
|
258 |
+
Juneau Alaska
|
259 |
+
Myrtle Beach South Carolina
|
260 |
+
Boston Massachusetts
|
261 |
+
Latrobe Pennsylvania
|
262 |
+
Knoxville Tennessee
|
263 |
+
Denver Colorado
|
264 |
+
Bangor Maine
|
265 |
+
Albany New York
|
266 |
+
Punta Gorda Florida
|
267 |
+
Fort Lauderdale Florida
|
268 |
+
Philadelphia Pennsylvania
|
269 |
+
Binghamton New York
|
270 |
+
Great Falls Montana
|
271 |
+
Shreveport Louisiana
|
272 |
+
Asheville North Carolina
|
273 |
+
Cheyenne Wyoming
|
274 |
+
Milwaukee Wisconsin
|
275 |
+
Nome Alaska
|
276 |
+
Laredo Texas
|
277 |
+
Des Moines Iowa
|
278 |
+
Fayetteville North Carolina
|
279 |
+
Lewisburg Pennsylvania
|
280 |
+
Fort Dodge Iowa
|
281 |
+
Cody Wyoming
|
282 |
+
Chattanooga Tennessee
|
283 |
+
Deadhorse Alaska
|
284 |
+
Kotzebue Alaska
|
285 |
+
Sitka Alaska
|
286 |
+
Bozeman Montana
|
287 |
+
Palm Springs California
|
288 |
+
Memphis Tennessee
|
289 |
+
Nantucket Massachusetts
|
290 |
+
Texarkana Texas
|
291 |
+
Lewiston Idaho
|
292 |
+
Valdosta Georgia
|
293 |
+
Birmingham England
|
294 |
+
Scranton Pennsylvania
|
295 |
+
Pensacola Florida
|
296 |
+
Hancock Michigan
|
297 |
+
Los Angeles California
|
298 |
+
Mason City Iowa
|
299 |
+
Savannah Georgia
|
300 |
+
West Yellowstone Montana
|
301 |
+
Long Beach California
|
302 |
+
Reno Nevada
|
303 |
+
Akron Ohio
|
304 |
+
Louisville Kentucky
|
305 |
+
Hartford Connecticut
|
306 |
+
Cincinnati Ohio
|
307 |
+
Rochester New York
|
308 |
+
San Francisco California
|
309 |
+
Detroit Michigan
|
310 |
+
Monterey California
|
311 |
+
Escanaba Michigan
|
312 |
+
Eau Claire Wisconsin
|
database/background/clean_data.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
with open('database/background/citySet.txt','r') as f:
|
2 |
+
city_set = f.read().strip().split('\n')
|
3 |
+
|
4 |
+
with open('database/background/citySet_with_states.txt','r') as f:
|
5 |
+
lines = f.read().strip().split('\n')
|
6 |
+
data = []
|
7 |
+
for unit in lines:
|
8 |
+
if unit.split('\t')[0] in city_set:
|
9 |
+
data.append(unit)
|
10 |
+
|
11 |
+
with open('database/background/citySet_with_states.txt','w') as f:
|
12 |
+
for unit in data:
|
13 |
+
f.write(unit + '\n')
|
14 |
+
f.close()
|
database/background/get_state_set.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
# print now directory
|
3 |
+
print(os.getcwd())
|
4 |
+
state_set = set()
|
5 |
+
city_set = set()
|
6 |
+
with open('database/background/citySet_with_states.txt','r') as f:
|
7 |
+
city_set = f.read().strip().split('\n')
|
8 |
+
for city in city_set:
|
9 |
+
city_name = city.split('\t')[0]
|
10 |
+
state_name = city.split('\t')[1]
|
11 |
+
state_set.add(state_name)
|
12 |
+
city_set.add(city_name)
|
13 |
+
# write to new file
|
14 |
+
f.close()
|
15 |
+
# with open('database/background/stateSet.txt', 'a') as f:
|
16 |
+
# for state_name in state_set:
|
17 |
+
# f.write(state_name.split('\\')[0] + '\n')
|
18 |
+
# f.close()
|
19 |
+
with open('database/background/citySet_2.txt', 'a') as f:
|
20 |
+
for city_name in city_set:
|
21 |
+
f.write(city_name.split('\\')[0] + '\n')
|
22 |
+
f.close()
|
database/background/stateSet.txt
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Wallonia
|
2 |
+
St. Thomas
|
3 |
+
Alaska
|
4 |
+
Washington
|
5 |
+
Kansas
|
6 |
+
Scotland
|
7 |
+
Michigan
|
8 |
+
Eastern District
|
9 |
+
New Jersey
|
10 |
+
Utah
|
11 |
+
Alexandria Governorate
|
12 |
+
North Dakota
|
13 |
+
Connecticut
|
14 |
+
West Virginia
|
15 |
+
Aguadilla
|
16 |
+
North Carolina
|
17 |
+
Ohio
|
18 |
+
Colorado
|
19 |
+
Arkansas
|
20 |
+
New York
|
21 |
+
Mississippi
|
22 |
+
San Juan
|
23 |
+
Minnesota
|
24 |
+
California
|
25 |
+
Maine
|
26 |
+
Nebraska
|
27 |
+
Idaho
|
28 |
+
Alabama
|
29 |
+
Texas
|
30 |
+
Maryland
|
31 |
+
England
|
32 |
+
New Mexico
|
33 |
+
South Carolina
|
34 |
+
Montana
|
35 |
+
Ponce
|
36 |
+
Tennessee
|
37 |
+
Florida
|
38 |
+
Oklahoma
|
39 |
+
Hawaii
|
40 |
+
New Hampshire
|
41 |
+
Iowa
|
42 |
+
Oregon
|
43 |
+
Wyoming
|
44 |
+
Pennsylvania
|
45 |
+
Tuscany
|
46 |
+
Virginia
|
47 |
+
Indiana
|
48 |
+
Missouri
|
49 |
+
District of Columbia
|
50 |
+
Saint Petersburg
|
51 |
+
Nevada
|
52 |
+
Massachusetts
|
53 |
+
Louisiana
|
54 |
+
Wisconsin
|
55 |
+
Saipan
|
56 |
+
Ontario
|
57 |
+
St. Croix
|
58 |
+
Kentucky
|
59 |
+
South Dakota
|
60 |
+
Arizona
|
61 |
+
Georgia
|
62 |
+
Rhode Island
|
63 |
+
Illinois
|
64 |
+
None
|
65 |
+
Victoria
|
database/background/test.py
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
f = open('/home/xj/toolAugEnv/code/toolConstraint/database/background/citySet.txt','r').read().strip().split('\n')
|
3 |
+
citySet = []
|
4 |
+
for line in f:
|
5 |
+
if line not in citySet:
|
6 |
+
citySet.append(line.strip())
|
7 |
+
else:
|
8 |
+
print(line)
|
database/flights/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
database/flights/clean_Flights_2022.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8dafdb0e3f8b79ce599a1e612a772865295bc226b46e5fb278368f7255b11cee
|
3 |
+
size 304807007
|
database/googleDistanceMatrix/clean_data.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import pandas as pd
|
3 |
+
import csv
|
4 |
+
|
5 |
+
def extract_before_parenthesis(s):
|
6 |
+
match = re.search(r'^(.*?)\([^)]*\)', s)
|
7 |
+
return match.group(1) if match else s
|
8 |
+
|
9 |
+
if __name__ == '__main__':
|
10 |
+
data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/googleDistanceMatrix/distance.csv')
|
11 |
+
data = data.to_dict(orient = 'split')
|
12 |
+
fieldnames = ['origin', 'destination', 'cost', 'duration', 'distance']
|
13 |
+
with open('/home/xj/toolAugEnv/code/toolConstraint/database/googleDistanceMatrix/distance2.csv', 'w', newline='') as csvfile:
|
14 |
+
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
|
15 |
+
writer.writeheader()
|
16 |
+
for row in data['data']:
|
17 |
+
writer.writerow({'origin': extract_before_parenthesis(row[0]), 'destination': extract_before_parenthesis(row[1]), 'cost': row[2], 'duration': row[3], 'distance': row[4]})
|
database/googleDistanceMatrix/distance.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
database/googleDistanceMatrix/distance_org.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
database/restaurants/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
database/restaurants/clean_restaurant_2022.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
evaluation/__pycache__/commonsenseConstraint.cpython-39.pyc
ADDED
Binary file (14 kB). View file
|
|
evaluation/__pycache__/eval.cpython-39.pyc
ADDED
Binary file (7.05 kB). View file
|
|
evaluation/__pycache__/hardConstraint.cpython-39.pyc
ADDED
Binary file (8.13 kB). View file
|
|
evaluation/commonsenseConstraint.py
ADDED
@@ -0,0 +1,735 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from annotation.src.utils import get_valid_name_city,extract_before_parenthesis,extract_numbers_from_filenames
|
2 |
+
from tools.flights.apis import Flights
|
3 |
+
from tools.accommodations.apis import Accommodations
|
4 |
+
from tools.restaurants.apis import Restaurants
|
5 |
+
from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
|
6 |
+
from tools.attractions.apis import Attractions
|
7 |
+
import math
|
8 |
+
import json
|
9 |
+
import re
|
10 |
+
import os
|
11 |
+
import sys
|
12 |
+
from tqdm import tqdm
|
13 |
+
import argparse
|
14 |
+
|
15 |
+
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
|
16 |
+
os.chdir(os.path.dirname(os.path.abspath(__file__)))
|
17 |
+
|
18 |
+
flight = Flights()
|
19 |
+
accommodation = Accommodations()
|
20 |
+
restaurants = Restaurants()
|
21 |
+
googleDistanceMatrix = GoogleDistanceMatrix()
|
22 |
+
attractions = Attractions()
|
23 |
+
|
24 |
+
city_state_set = open('../database/background/citySet_with_states.txt','r').read().split('\n')
|
25 |
+
city_state_map = {x:y for x,y in [unit.split('\t') for unit in city_state_set]}
|
26 |
+
|
27 |
+
|
28 |
+
def load_line_json_data(filename):
|
29 |
+
data = []
|
30 |
+
with open(filename, 'r', encoding='utf-8') as f:
|
31 |
+
for line in f.read().strip().split('\n'):
|
32 |
+
unit = json.loads(line)
|
33 |
+
data.append(unit)
|
34 |
+
return data
|
35 |
+
|
36 |
+
|
37 |
+
def count_consecutive_values(lst):
|
38 |
+
if not lst:
|
39 |
+
return []
|
40 |
+
|
41 |
+
result = []
|
42 |
+
current_string = lst[0]
|
43 |
+
count = 1
|
44 |
+
|
45 |
+
for i in range(1, len(lst)):
|
46 |
+
if lst[i] == current_string:
|
47 |
+
count += 1
|
48 |
+
else:
|
49 |
+
result.append((current_string, count))
|
50 |
+
current_string = lst[i]
|
51 |
+
count = 1
|
52 |
+
|
53 |
+
result.append((current_string, count)) # Add the last group of values
|
54 |
+
return result
|
55 |
+
|
56 |
+
|
57 |
+
def transportation_match(text: str):
|
58 |
+
|
59 |
+
if 'taxi' in text.lower():
|
60 |
+
return 'Taxi'
|
61 |
+
|
62 |
+
elif 'self-driving' in text.lower():
|
63 |
+
return 'Self-driving'
|
64 |
+
|
65 |
+
elif 'flight' in text.lower():
|
66 |
+
return 'Flight'
|
67 |
+
|
68 |
+
|
69 |
+
def extract_from_to(text: str):
|
70 |
+
"""
|
71 |
+
Extracts 'A' and 'B' from the format "from A to B" in the given text, with B ending at a comma or the end of the string.
|
72 |
+
|
73 |
+
Args:
|
74 |
+
- text (str): The input string.
|
75 |
+
|
76 |
+
Returns:
|
77 |
+
- tuple: A tuple containing 'A' and 'B'. If no match is found, returns (None, None).
|
78 |
+
"""
|
79 |
+
pattern = r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)"
|
80 |
+
matches = re.search(pattern, text)
|
81 |
+
return matches.groups() if matches else (None, None)
|
82 |
+
|
83 |
+
|
84 |
+
|
85 |
+
def is_valid_city_sequence(city_list):
|
86 |
+
"""
|
87 |
+
Checks if the city sequence is valid. A valid sequence has every city (except the first and last)
|
88 |
+
appearing consecutively, and no city should appear again once its sequence is over.
|
89 |
+
|
90 |
+
Args:
|
91 |
+
- city_list (list): List of cities.
|
92 |
+
|
93 |
+
Returns:
|
94 |
+
- bool: True if the sequence is valid, False otherwise.
|
95 |
+
"""
|
96 |
+
|
97 |
+
# If the list has less than 3 cities, it's invalid.
|
98 |
+
if len(city_list) < 3:
|
99 |
+
return False
|
100 |
+
|
101 |
+
# Set to keep track of visited cities
|
102 |
+
visited_cities = set()
|
103 |
+
|
104 |
+
i = 0
|
105 |
+
while i < len(city_list):
|
106 |
+
city = city_list[i]
|
107 |
+
|
108 |
+
# If the city was already visited, it's invalid.
|
109 |
+
if city in visited_cities and (i != 0 and i != len(city_list) - 1):
|
110 |
+
return False
|
111 |
+
|
112 |
+
# Count the consecutive occurrences of the city
|
113 |
+
count = 0
|
114 |
+
while i < len(city_list) and city_list[i] == city:
|
115 |
+
count += 1
|
116 |
+
i += 1
|
117 |
+
|
118 |
+
# If the city appeared only once in the medium, it's invalid.
|
119 |
+
if count == 1 and 0 < i - 1 < len(city_list) - 1:
|
120 |
+
return False
|
121 |
+
|
122 |
+
visited_cities.add(city)
|
123 |
+
|
124 |
+
return True
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
def is_reasonalbe_visiting_city(question, tested_data):
|
129 |
+
|
130 |
+
city_list = []
|
131 |
+
|
132 |
+
# print(tested_data)
|
133 |
+
for i in range(min(question['days'],len(tested_data))):
|
134 |
+
city_value = tested_data[i]['current_city']
|
135 |
+
|
136 |
+
if 'from' in city_value:
|
137 |
+
city1, city2 = extract_from_to(city_value)
|
138 |
+
city1 = extract_before_parenthesis(city1)
|
139 |
+
city2 = extract_before_parenthesis(city2)
|
140 |
+
if i==0 and city1 != question['org']:
|
141 |
+
return False, f"The first day's city should be {question['org']}."
|
142 |
+
|
143 |
+
city_list += [city1, city2]
|
144 |
+
|
145 |
+
else:
|
146 |
+
city_list.append(extract_before_parenthesis(city_value))
|
147 |
+
|
148 |
+
if city_list[0] != city_list[-1]:
|
149 |
+
return False, "The trip should be a closed circle."
|
150 |
+
|
151 |
+
if not is_valid_city_sequence(city_list):
|
152 |
+
return False, "The city sequence is invalid."
|
153 |
+
|
154 |
+
for idx, city in enumerate(city_list):
|
155 |
+
if city not in city_state_map:
|
156 |
+
return False, f"{city} is not a valid city."
|
157 |
+
if idx not in [0,len(city_list)-1] and question['days'] >3 and city_state_map[city] != question['dest']:
|
158 |
+
return False, f"{city} is not in {question['dest']}."
|
159 |
+
|
160 |
+
return True, None
|
161 |
+
|
162 |
+
|
163 |
+
def is_valid_restaurants(question, tested_data):
|
164 |
+
|
165 |
+
restaurants_list = []
|
166 |
+
|
167 |
+
for i in range(min(question['days'],len(tested_data))):
|
168 |
+
unit = tested_data[i]
|
169 |
+
|
170 |
+
if 'breakfast' in unit and unit['breakfast'] and unit['breakfast'] != '-':
|
171 |
+
if unit['breakfast'] not in restaurants_list:
|
172 |
+
restaurants_list.append(unit['breakfast'])
|
173 |
+
else:
|
174 |
+
return False, f"The restaurant in day {i+1} breakfast is repeated."
|
175 |
+
# elif 'breakfast' not in unit :
|
176 |
+
# return False, f"No Breakfast Info."
|
177 |
+
|
178 |
+
if 'lunch' in unit and unit['lunch'] and unit['lunch'] != '-':
|
179 |
+
if unit['lunch'] not in restaurants_list:
|
180 |
+
restaurants_list.append(unit['lunch'])
|
181 |
+
else:
|
182 |
+
return False, f"The restaurant in day {i+1} lunch {unit['lunch']} is repeated."
|
183 |
+
# elif 'lunch' not in unit:
|
184 |
+
# return False, f"No Lunch Info."
|
185 |
+
|
186 |
+
if 'dinner' in unit and unit['dinner'] and unit['dinner'] != '-':
|
187 |
+
if unit['dinner'] not in restaurants_list:
|
188 |
+
restaurants_list.append(unit['dinner'])
|
189 |
+
else:
|
190 |
+
return False, f"The restaurant in day {i+1} dinner is repeated."
|
191 |
+
# elif 'dinner' not in unit:
|
192 |
+
# return False, f"No Dinner Info."
|
193 |
+
|
194 |
+
return True, None
|
195 |
+
|
196 |
+
def is_valid_attractions(question, tested_data):
|
197 |
+
|
198 |
+
attractions_list = []
|
199 |
+
|
200 |
+
for i in range(min(question['days'],len(tested_data))):
|
201 |
+
unit = tested_data[i]
|
202 |
+
|
203 |
+
if 'attraction' in unit and unit['attraction'] and unit['attraction'] != '-':
|
204 |
+
for attraction in unit['attraction'].split(';')[:-1]:
|
205 |
+
if attraction not in attractions_list:
|
206 |
+
attractions_list.append(attraction)
|
207 |
+
else:
|
208 |
+
return False, f"The attraction '{attraction}' in day {i+1} is repeated."
|
209 |
+
|
210 |
+
# elif 'attraction' not in unit:
|
211 |
+
# return False, f"No Attraction Info."
|
212 |
+
|
213 |
+
return True, None
|
214 |
+
|
215 |
+
def is_valid_transportation(question, tested_data):
|
216 |
+
|
217 |
+
if tested_data[0]['transportation'] and tested_data[0]['transportation'] != '-':
|
218 |
+
transportation_list = [transportation_match(tested_data[0]['transportation'])]
|
219 |
+
|
220 |
+
else:
|
221 |
+
return False, "The transportation in day 1 should not be empty."
|
222 |
+
|
223 |
+
for i in range(min(question['days'],len(tested_data))):
|
224 |
+
unit = tested_data[i]
|
225 |
+
|
226 |
+
if 'transportation' in unit and unit['transportation'] and unit['transportation'] != '-':
|
227 |
+
transportation_list.append(transportation_match(unit['transportation']))
|
228 |
+
# elif 'transportation' not in unit:
|
229 |
+
# return False, f"No Transportation Info."
|
230 |
+
|
231 |
+
if (('Self-driving' in transportation_list) and ('Flight' in transportation_list)) or (('Taxi' in transportation_list) and ('Self-driving' in transportation_list)):
|
232 |
+
return False, "The transportation is conflicting."
|
233 |
+
|
234 |
+
return True, None
|
235 |
+
|
236 |
+
def is_valid_information_in_current_city(question, tested_data):
|
237 |
+
|
238 |
+
for i in range(min(question['days'],len(tested_data))):
|
239 |
+
unit = tested_data[i]
|
240 |
+
current_city = unit['current_city']
|
241 |
+
final_city_list = []
|
242 |
+
|
243 |
+
if 'from' in current_city:
|
244 |
+
city1, city2 = extract_from_to(current_city)
|
245 |
+
city1 = extract_before_parenthesis(city1)
|
246 |
+
city2 = extract_before_parenthesis(city2)
|
247 |
+
final_city_list = [city1, city2]
|
248 |
+
else:
|
249 |
+
final_city_list = extract_before_parenthesis(current_city)
|
250 |
+
|
251 |
+
if 'transportation' in unit and unit['transportation'] and unit['transportation'] != '-':
|
252 |
+
for city in final_city_list:
|
253 |
+
if city not in unit['transportation']:
|
254 |
+
# print(city)
|
255 |
+
return False, f"The transportation in day {i+1} is invalid city choice."
|
256 |
+
# elif 'transportation' not in unit:
|
257 |
+
# return False, f"No Transportation Info."
|
258 |
+
|
259 |
+
if 'breakfast' in unit and unit['breakfast'] and unit['breakfast'] != '-':
|
260 |
+
|
261 |
+
flag = False
|
262 |
+
|
263 |
+
for city in final_city_list:
|
264 |
+
if city in unit['breakfast']:
|
265 |
+
flag = True
|
266 |
+
|
267 |
+
if not flag:
|
268 |
+
return False, f"The breakfast in day {i+1} is invalid city choice."
|
269 |
+
# elif 'breakfast' not in unit:
|
270 |
+
# return False, f"No Breakfast Info."
|
271 |
+
|
272 |
+
if 'lunch' in unit and unit['lunch'] and unit['lunch'] != '-':
|
273 |
+
flag = False
|
274 |
+
|
275 |
+
for city in final_city_list:
|
276 |
+
if city in unit['lunch']:
|
277 |
+
flag = True
|
278 |
+
|
279 |
+
if not flag:
|
280 |
+
return False, f"The lunch in day {i+1} is invalid city choice."
|
281 |
+
# elif 'lunch' not in unit:
|
282 |
+
# return False, f"No Lunch Info."
|
283 |
+
|
284 |
+
if 'dinner' in unit and unit['dinner'] and unit['dinner'] != '-':
|
285 |
+
flag = False
|
286 |
+
|
287 |
+
for city in final_city_list:
|
288 |
+
if city in unit['dinner']:
|
289 |
+
flag = True
|
290 |
+
|
291 |
+
if not flag:
|
292 |
+
return False, f"The dinner in day {i+1} is invalid city choice."
|
293 |
+
# elif 'dinner' not in unit:
|
294 |
+
# return False, f"No Dinner Info."
|
295 |
+
|
296 |
+
if 'attraction' in unit and unit['attraction'] and unit['attraction'] != '-':
|
297 |
+
|
298 |
+
attraction_list = unit['attraction'].split(';')[:-1]
|
299 |
+
|
300 |
+
for attraction in attraction_list:
|
301 |
+
flag = False
|
302 |
+
for city in final_city_list:
|
303 |
+
if city in attraction:
|
304 |
+
flag = True
|
305 |
+
if not flag:
|
306 |
+
return False, f"The attraction in day {i+1} is invalid city choice."
|
307 |
+
|
308 |
+
# elif 'attraction' not in unit:
|
309 |
+
# return False, f"No Attraction Info."
|
310 |
+
|
311 |
+
|
312 |
+
if 'accommodation' in unit and unit['accommodation'] and unit['accommodation'] != '-':
|
313 |
+
|
314 |
+
if final_city_list[-1] not in unit['accommodation']:
|
315 |
+
return False, f"The accommodation in day {i+1} is invalid city choice."
|
316 |
+
|
317 |
+
# elif 'accommodation' not in unit:
|
318 |
+
# return False, f"No Accommodation Info."
|
319 |
+
|
320 |
+
return True, None
|
321 |
+
|
322 |
+
# hallucination
|
323 |
+
def is_valid_information_in_sandbox(question, tested_data):
    """Verify every named entity in the plan exists in the sandbox databases.

    Cross-checks each day's transportation (flight number, self-driving or
    taxi route), the three meals, the attractions and the accommodation
    against the corresponding reference tables. Stops at the first
    hallucinated entry.

    Args:
        question: query dict; only question['days'] is read here.
        tested_data: list of per-day plan dicts.

    Returns:
        (True, None) when everything is found, otherwise (False, reason).

    Raises:
        ValueError: when a flight-style transportation string cannot be
            parsed into origin/destination cities.
    """
    for i in range(min(question['days'],len(tested_data))):
        unit = tested_data[i]

        if unit['transportation'] and unit['transportation'] != '-':
            value = unit['transportation']
            org_city, dest_city = extract_from_to(value)
            # Fall back to the day's current_city field when the
            # transportation string itself carries no "from A to B" pair.
            if org_city == None or dest_city == None:
                org_city, dest_city = extract_from_to(unit['current_city'])
            if 'flight number' in value.lower():
                try:
                    org_city = extract_before_parenthesis(org_city)
                    dest_city = extract_before_parenthesis(dest_city)
                except TypeError:
                    # Neither source yielded parseable city names: fatal for flights.
                    raise ValueError("The transportation {} in day {} can not be parsed.".format(value,i+1))
                # print(value)
                # The quoted flight number must exist with exactly this route.
                if len(flight.data[(flight.data['Flight Number'] == value.split('Flight Number: ')[1].split(',')[0]) & (flight.data['OriginCityName']==org_city) & (flight.data['DestCityName']==dest_city)]) < 1:
                    return False, f"The flight number in day {i+1} is invalid in the sandbox."

            elif 'self-driving' in value.lower() or 'taxi' in value.lower():
                try:
                    org_city = extract_before_parenthesis(org_city)
                    dest_city = extract_before_parenthesis(dest_city)
                except TypeError:
                    # Best effort for ground transport: look up '-' instead of failing.
                    org_city = '-'
                    dest_city = '-'
                    print("The transportation {} in day {} can not be parsed and '-' will be used instead.".format(value,i+1))

                if 'self-driving' in value.lower():
                    # A missing cost means the route does not exist in the matrix.
                    if googleDistanceMatrix.run_for_evaluation(org_city, dest_city, mode='self-driving')['cost'] == None:
                        return False, f"The self-driving in day {i+1} is invalid in the sandbox."
                else:
                    if googleDistanceMatrix.run_for_evaluation(org_city, dest_city, mode='taxi')['cost'] == None:
                        return False, f"The taxi in day {i+1} is invalid in the sandbox."

        if 'breakfast' in unit and unit['breakfast'] and unit['breakfast'] != '-':
            name, city = get_valid_name_city(unit['breakfast'])
            # Restaurant name must match (regex-escaped substring) in that city.
            if len(restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]) < 1:
                return False, f"The breakfast in day {i+1} is invalid in the sandbox."
        # elif 'breakfast' not in unit:
        #     return False, f"No Breakfast Info."

        if 'lunch' in unit and unit['lunch'] and unit['lunch'] != '-':
            name, city = get_valid_name_city(unit['lunch'])
            if len(restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]) < 1:
                return False, f"The lunch in day {i+1} is invalid in the sandbox."
        # elif 'lunch' not in unit:
        #     return False, f"No Lunch Info."

        if 'dinner' in unit and unit['dinner'] and unit['dinner'] != '-':
            name, city = get_valid_name_city(unit['dinner'])
            if len(restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]) < 1:
                return False, f"The dinner in day {i+1} is invalid in the sandbox."
        # elif 'dinner' not in unit:
        #     return False, f"No Dinner Info."

        if 'attraction' in unit and unit['attraction'] and unit['attraction'] != '-':
            # Attractions are ';'-separated with a trailing separator, so the
            # final empty element is dropped by [:-1].
            attractions_list = unit['attraction'].split(';')[:-1]
            for attraction in attractions_list:
                name, city = get_valid_name_city(attraction)
                if len(attractions.data[(attractions.data['Name'].astype(str).str.contains(re.escape(name))) & (attractions.data['City'] == city)]) < 1:
                    return False, f"The attraction {attraction} in day {i+1} is invalid in the sandbox."
        # elif 'attraction' not in unit:
        #     return False, f"No Attraction Info."

        if 'accommodation' in unit and unit['accommodation'] and unit['accommodation'] != '-':
            name, city = get_valid_name_city(unit['accommodation'])
            # print(name,city)
            # print(accommodation.data[accommodation.data['NAME'].astype(str).str.contains(re.escape(name))])
            # NOTE: the accommodations table uses 'NAME' / 'city' (different
            # casing from the restaurant/attraction tables).
            if len(accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]) < 1:
                return False, f"The accommodation in day {i+1} is invalid in the sandbox."
        # elif 'accommodation' not in unit:
        #     return False, f"No Accommodation Info."

    return True, None
|
399 |
+
|
400 |
+
|
401 |
+
def is_valid_accommodaton(question, tested_data):
    """Check the plan's accommodations against the minimum-nights rule.

    Collapses the per-day accommodation values into consecutive runs and,
    for each named stay, verifies that the length of the run is at least
    the 'minimum nights' recorded for that property in the sandbox.

    Args:
        question: query dict; only question['days'] is read here.
        tested_data: list of per-day plan dicts.

    Returns:
        (True, None) when all stays are valid, otherwise (False, reason).
    """
    data = []
    for i in range(min(question['days'],len(tested_data))):
        unit = tested_data[i]

        if 'accommodation' not in unit:
            return False, f"No Accommodation Info."

        data.append(unit['accommodation'])
    # data = [unit['accommodation'] for unit in tested_data]
    # Each element is (accommodation_value, consecutive_night_count).
    consectutive_accommodation = count_consecutive_values(data)
    for unit in consectutive_accommodation:
        # print(unit)
        if unit and unit[0] not in ['-',''] :
            name, city = get_valid_name_city(unit[0])
            # Evaluate the name/city filter once (it was previously computed
            # twice: once for the existence check and once for the lookup).
            matched = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
            # Only enforce the rule on an unambiguous (single-row) match.
            if len(matched) == 1 and unit[1] < matched.iloc[0]['minimum nights']:
                return False, f"The accommodation {unit[0]} do not obey the minumum nights rule."
            # can not parse data
            # except re.error:
            #     continue

    return True, None
|
425 |
+
|
426 |
+
def is_valid_visiting_city_number(question, tested_data):
    """Verify the plan visits exactly the requested number of distinct cities.

    Gathers every city mentioned in each day's ``current_city`` value (both
    endpoints of a travel day), removes the origin city, and compares the
    remaining count with ``question['visiting_city_number']``.

    Returns:
        (True, None) on success, otherwise (False, reason).
    """
    visited = set()
    span = min(question['days'], len(tested_data))

    for day_idx, record in enumerate(tested_data[:span]):
        current = record['current_city']

        if 'from' not in current:
            # A stay-put day contributes a single city.
            visited.add(extract_before_parenthesis(current))
            continue

        depart, arrive = extract_from_to(current)
        depart = extract_before_parenthesis(depart)
        arrive = extract_before_parenthesis(arrive)

        # The itinerary must leave from the requested origin on day one.
        if day_idx == 0 and depart != question['org']:
            return False, f"The first day's city should be {question['org']}."

        visited.add(depart)
        visited.add(arrive)

    # The origin itself does not count toward the visiting quota.
    visited.discard(question['org'])

    if len(visited) != question['visiting_city_number']:
        return False, f"The number of visiting cities should be {question['visiting_city_number']}."

    return True, None
|
453 |
+
|
454 |
+
def is_valid_days(question, tested_data):
    """Check that the plan fills in exactly ``question['days']`` day entries.

    A day counts as filled when its record is a non-empty dict and its
    ``current_city`` is not the placeholder sentence used for unneeded days.

    Returns:
        (True, None) on success, otherwise (False, reason).
    """
    placeholder = "You don't need to fill in the information for this or later days."
    span = min(question['days'], len(tested_data))

    filled = sum(
        1
        for entry in tested_data[:span]
        if entry != {} and entry['current_city'] != placeholder
    )

    if filled == question['days']:
        return True, None
    return False, f"The number of days should be {question['days']}."
|
465 |
+
|
466 |
+
def is_not_absent(question, tested_data):
    """Check that the plan supplies enough information to be scored.

    Rejects plans that fail the day-count or city-count checks, that are
    missing any expected per-day key, that omit required entries
    (transportation on travel days, attractions on stay days, meals on
    non-departure days, accommodation on non-final days), or whose overall
    fill rate is below 50%.

    Returns:
        (True, None) on success, otherwise (False, reason).
    """
    # Six scoreable fields are expected per day: transportation, three
    # meals, attraction, accommodation.
    needed_info = 6 * question['days']
    total_valid_info = 0

    if not is_valid_days(question, tested_data)[0]:
        return False, "Invalid Days"

    if not is_valid_visiting_city_number(question, tested_data)[0]:
        return False, "Invalid City Number"

    for i in range(min(question['days'],len(tested_data))):
        unit = tested_data[i]

        if 'transportation' not in unit:
            return False, f"No Transportation Info."

        if 'breakfast' not in unit:
            return False, f"No Breakfast Info."

        if 'lunch' not in unit:
            return False, f"No Lunch Info."

        if 'dinner' not in unit:
            return False, f"No Dinner Info."

        if 'attraction' not in unit:
            return False, f"No Attraction Info."

        if 'accommodation' not in unit:
            return False, f"No Accommodation Info."

        # Travel days (current_city mentions a from/to leg) must name a
        # transportation mode.
        # NOTE(review): this test matches 'to ' while the stay-day test
        # below matches ' to ' (leading space) — confirm the asymmetry is
        # intentional before relying on it.
        if ('from ' in unit['current_city'] or 'to ' in unit['current_city']) and unit['transportation'] in ['','-']:
            return False, f"No transportation in day {i+1} is not allowed."

        # Stay-put days must include at least one attraction.
        if ('from ' not in unit['current_city'] and ' to ' not in unit['current_city']) and unit['attraction'] in ['','-']:
            return False, f"No attaction in day {i+1} is not allowed."

        # Every day except the last needs a place to sleep.
        if i != question['days'] - 1 and unit['accommodation'] in ['','-']:
            return False, f"No accommodation in day {i+1} is not allowed."

        # Meals may only be skipped on departure ('from ...') days.
        if (unit['breakfast'] in ['','-'] or unit['lunch'] in ['','-'] or unit['dinner'] in ['','-']) and 'from ' not in unit['current_city']:
            return False, f"No meal in day {i+1} is not allowed."

        # Count every non-empty field toward the fill-rate estimate; note
        # this iterates ALL keys in the unit, current_city included.
        for key in unit:
            if unit[key] and unit[key] != '-':
                total_valid_info += 1

    # Fewer than half of the expected fields filled in → reject.
    if total_valid_info * 1.0 / needed_info < 0.5:
        return False, f"The absent information is more than 50%."

    return True, None
|
519 |
+
|
520 |
+
|
521 |
+
def evaluation(query_data, tested_data):
    """Run every commonsense check on a plan.

    Returns:
        dict mapping each check name to its (passed, reason) pair.
    """
    checks = [
        ('is_reasonalbe_visiting_city', is_reasonalbe_visiting_city),
        ('is_valid_restaurants', is_valid_restaurants),
        ('is_valid_attractions', is_valid_attractions),
        ('is_valid_accommodation', is_valid_accommodaton),
        ('is_valid_transportation', is_valid_transportation),
        ('is_valid_information_in_current_city', is_valid_information_in_current_city),
        ('is_valid_information_in_sandbox', is_valid_information_in_sandbox),
        ('is_not_absent', is_not_absent),
    ]
    # Dict comprehension preserves insertion order, so the checks run and
    # are reported in the same sequence as before.
    return {label: checker(query_data, tested_data) for label, checker in checks}
|
532 |
+
|
533 |
+
def boolean_evaluation(query_data, tested_data):
    """Run every commonsense check and collapse the outcome to a boolean.

    Prints the first failure reason encountered (in check order) and
    returns False; returns True only when every check passes.
    """
    checks = [
        ('is_reasonalbe_visiting_city', is_reasonalbe_visiting_city),
        ('is_valid_restaurants', is_valid_restaurants),
        ('is_valid_accommodation', is_valid_accommodaton),
        ('is_valid_attractions', is_valid_attractions),
        ('is_valid_transportation', is_valid_transportation),
        ('is_valid_information_in_current_city', is_valid_information_in_current_city),
        ('is_valid_information_in_sandbox', is_valid_information_in_sandbox),
        ('is_not_absent', is_not_absent),
    ]
    return_info = {label: checker(query_data, tested_data) for label, checker in checks}

    for verdict, reason in return_info.values():
        if verdict == False:
            print(reason)
            return False
    return True
|
548 |
+
|
549 |
+
# if __name__ == '__main__':
|
550 |
+
# number_list = extract_numbers_from_filenames('/home/xj/toolAugEnv/code/toolConstraint/data/annotation/lrz')
|
551 |
+
# # json_data = json.load(open('/home/xj/toolAugEnv/code/toolConstraint/data/annotation/x/annotation_4.json'))
|
552 |
+
# query_data = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/query/lrz.jsonl')
|
553 |
+
# for idx in number_list:
|
554 |
+
# json_data = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/lrz/annotation_{idx}.json'))
|
555 |
+
# print(str(idx), evaluation(query_data[idx-1], json_data))
|
556 |
+
# # json_data = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/plan_{idx}.json'))
|
557 |
+
# # query_data = load_line_json_data('/home/xj/toolAugEnv/code/toolConstraint/data/query/test.jsonl')[idx-1]
|
558 |
+
# # help me write all function name in this file, just the name
|
559 |
+
# #
|
560 |
+
# # list all function name in this file
|
561 |
+
# # ['is_reasonalbe_visiting_city', 'is_valiable_restaurants', 'is_valiable_attractions', 'is_valiable_transportation', 'is_valid_information_in_current_city', 'is_valid_information_in_sandbox']
|
562 |
+
# # print(is_valiable_restaurants(query_data, json_data))
|
563 |
+
|
564 |
+
# if __name__ == "__main__":
|
565 |
+
# user = 'zk'
|
566 |
+
# query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
|
567 |
+
# idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
|
568 |
+
# commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
|
569 |
+
# for idx in idx_number_list:
|
570 |
+
# print(idx)
|
571 |
+
# query_data = query_data_list[idx-1]
|
572 |
+
# generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/plan_{idx}.json'))
|
573 |
+
# # generated_plan = generated_plan[:-1]
|
574 |
+
# if generated_plan[-1]['gpt-3.5-turbo-16k-result'] != 'Plan Fail':
|
575 |
+
# info_box = evaluation(query_data, generated_plan[-1]['gpt-3.5-turbo-16k-result'])
|
576 |
+
# generated_plan[-1]['toolAug-commonsense'] = info_box
|
577 |
+
# else:
|
578 |
+
# generated_plan[-1]['toolAug-commonsense'] = None
|
579 |
+
# info_box = None
|
580 |
+
# commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
|
581 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/plan_{idx}.json','w') as f:
|
582 |
+
# json.dump(generated_plan,f)
|
583 |
+
|
584 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/turbo16k-turbo16k/{user}/commonsense_statistic.json','w') as f:
|
585 |
+
# json.dump(commonsense_statistic,f)
|
586 |
+
|
587 |
+
# if __name__ == "__main__":
|
588 |
+
# user = 'all'
|
589 |
+
# model_type = ['chatgpt','gpt4','greedy_search'][2]
|
590 |
+
# query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
|
591 |
+
# # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
|
592 |
+
# idx_number_list = [i for i in range(1,501)]
|
593 |
+
# commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
|
594 |
+
|
595 |
+
# for idx in idx_number_list:
|
596 |
+
# print(idx)
|
597 |
+
# query_data = query_data_list[idx-1]
|
598 |
+
# generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/plan_{idx}.json'))
|
599 |
+
# # generated_plan = generated_plan[:-1]
|
600 |
+
# if model_type == 'greedy_search':
|
601 |
+
# info_box = evaluation(query_data, generated_plan[-1][f'greedy_search_plan'])
|
602 |
+
# else:
|
603 |
+
# info_box = evaluation(query_data, generated_plan[-1][f'{model_type}_human_collected_info_results_parsed'])
|
604 |
+
# generated_plan[-1][f'{model_type}_with_human_collected_commonsense'] = info_box
|
605 |
+
# commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
|
606 |
+
|
607 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/plan_{idx}.json','w') as f:
|
608 |
+
# json.dump(generated_plan,f)
|
609 |
+
|
610 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre2/{user}/{model_type}_with_human_collected_commonsense_statistic.json','w') as f:
|
611 |
+
# json.dump(commonsense_statistic,f)
|
612 |
+
|
613 |
+
|
614 |
+
# if __name__ == "__main__":
|
615 |
+
# user = 'all'
|
616 |
+
# query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
|
617 |
+
# idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
|
618 |
+
# hardConstraint_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
|
619 |
+
# not_satified = []
|
620 |
+
# for idx in tqdm(idx_number_list):
|
621 |
+
# # print(idx)
|
622 |
+
# query_data = query_data_list[idx-1]
|
623 |
+
# generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}/annotation_{idx}.json'))
|
624 |
+
|
625 |
+
# if not boolean_evaluation(query_data, generated_plan):
|
626 |
+
# not_satified.append(idx)
|
627 |
+
# print(idx)
|
628 |
+
# generated_plan = generated_plan[:-1]
|
629 |
+
# print(not_satified)
|
630 |
+
|
631 |
+
# Script entry point: run the commonsense evaluation over the human-annotated
# plans of one data split and persist both per-plan results and the
# aggregated statistics. Paths are hard-coded to the author's machine.
if __name__ == "__main__":
    set_type = ["train",'dev','test'][0]  # select which split to score
    query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{set_type}/query/query.jsonl')
    # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{set_type}/plan')
    commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
    not_satified = []
    # print( idx_number_list)
    # Plan files are 1-indexed on disk, hence range(1, len+1).
    for idx in tqdm(range(1,len(query_data_list)+1)):
        # print(idx)
        query_data = query_data_list[idx-1]
        generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{set_type}/plan/plan_{idx}.json'))
        try:
            store_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{set_type}/plan_{idx}.json'))
        except FileNotFoundError:
            # No previous results for this plan yet; start a fresh record.
            store_plan = [{}]
        # generated_plan[1] holds the annotated plan to score — TODO confirm
        # against the plan-file layout.
        info_box = evaluation(query_data,generated_plan[1])
        # if not boolean_evaluation(query_data, generated_plan[1]):
        #     not_satified.append(idx)
        #     print(idx)
        # print(store_plan[-1])
        store_plan[-1][f'human_anno_commonsense_constraint'] = info_box
        with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{set_type}/plan_{idx}.json','w') as f:
            json.dump(store_plan,f)
        commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
    # not_satified is never populated here (collection is commented out above).
    print(not_satified)
    with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{set_type}/human_anno_commonsense_constraint.json','w') as f:
        json.dump(commonsense_statistic,f)
|
658 |
+
|
659 |
+
# if __name__ == "__main__":
|
660 |
+
# user = 'all'
|
661 |
+
# model_type = ['chatgpt','gpt4'][1]
|
662 |
+
# query_data_list = load_line_json_data(f'/home/xj/toolAugEnv/code/toolConstraint/data/query/{user}.jsonl')
|
663 |
+
# # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
|
664 |
+
# idx_number_list = [i for i in range(1,501)]
|
665 |
+
# commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
|
666 |
+
# cnt = 0
|
667 |
+
# for idx in idx_number_list:
|
668 |
+
# # print(idx)
|
669 |
+
# query_data = query_data_list[idx-1]
|
670 |
+
# generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/pre/{user}/plan_{idx}.json'))[-1]['gpt4_human_collected_info_results_parsed']
|
671 |
+
# # generated_plan = generated_plan[:-1]
|
672 |
+
|
673 |
+
# if not boolean_evaluation(query_data, generated_plan):
|
674 |
+
# cnt += 1
|
675 |
+
# print(idx)
|
676 |
+
# print(cnt)
|
677 |
+
|
678 |
+
# if __name__ == "__main__":
|
679 |
+
# parser = argparse.ArgumentParser(description="")
|
680 |
+
# # model_type = ['gpt-3.5-turbo-1106','gpt-4-1106-preview','greedy_search','mistral-7B-32K','gemini2','mixtral','gpt-3.5-turbo-11062'][-1]
|
681 |
+
# # method = ['direct','cot','react','reflexion','tool-use'][-1]
|
682 |
+
# # set_type = ['dev','test'][0]
|
683 |
+
# parser.add_argument("--model_type", type=str, default="gpt-3.5-turbo-1106")
|
684 |
+
# parser.add_argument("--method", type=str, default="direct")
|
685 |
+
# parser.add_argument("--set_type", type=str, default="dev")
|
686 |
+
# args = parser.parse_args()
|
687 |
+
# directory = f'/home/xj/toolAugEnv/code/toolConstraint/data/final_data/{args.set_type}'
|
688 |
+
# query_data_list = load_line_json_data(os.path.join(directory, 'query/query.jsonl'))
|
689 |
+
# # idx_number_list = extract_numbers_from_filenames(f'/home/xj/toolAugEnv/code/toolConstraint/data/annotation/{user}')
|
690 |
+
# idx_number_list = [i for i in range(1,len(query_data_list)+1)]
|
691 |
+
# commonsense_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
|
692 |
+
# deliver_cnt = 0
|
693 |
+
# if args.method == 'tool-use':
|
694 |
+
# suffix = ''
|
695 |
+
# else:
|
696 |
+
# suffix = '_with_human_info'
|
697 |
+
# for idx in tqdm(idx_number_list):
|
698 |
+
# # print(idx)
|
699 |
+
# query_data = query_data_list[idx-1]
|
700 |
+
# generated_plan = json.load(open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/plan_{idx}.json'))
|
701 |
+
# # generated_plan = generated_plan[:-1]
|
702 |
+
# if args.model_type == 'greedy_search':
|
703 |
+
# info_box = evaluation(query_data, generated_plan[-1][f'greedy_search_plan'])
|
704 |
+
# else:
|
705 |
+
# if args.method == 'tool-use':
|
706 |
+
# suffix2 = ''
|
707 |
+
# else:
|
708 |
+
# suffix2 = '_collected'
|
709 |
+
# if generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] and generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results']!='Max Token Length Exceeded.':
|
710 |
+
# try:
|
711 |
+
# info_box = evaluation(query_data, generated_plan[-1][f'{args.model_type}_{args.method}{suffix}_results_parsed'])
|
712 |
+
# except KeyError:
|
713 |
+
# info_box = None
|
714 |
+
# generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] = ""
|
715 |
+
# except IndexError:
|
716 |
+
# info_box = None
|
717 |
+
# generated_plan[-1][f'{args.model_type}_{args.method}{suffix2}_info_results'] = ""
|
718 |
+
# else:
|
719 |
+
# info_box = None
|
720 |
+
# if info_box:
|
721 |
+
# deliver_cnt += 1
|
722 |
+
# generated_plan[-1][f'{args.model_type}_{args.method}{suffix}_commonsense_constraint'] = info_box
|
723 |
+
# commonsense_statistic[query_data['level']][query_data['days']].append(info_box)
|
724 |
+
|
725 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/plan_{idx}.json','w') as f:
|
726 |
+
# json.dump(generated_plan,f)
|
727 |
+
|
728 |
+
# with open(f'/home/xj/toolAugEnv/code/toolConstraint/results/{args.set_type}/{args.model_type}_{args.method}{suffix}_commonsense_constraint.json','w') as f:
|
729 |
+
# json.dump(commonsense_statistic,f)
|
730 |
+
|
731 |
+
# if args.set_type == 'dev':
|
732 |
+
# print(f"Model:{args.model_type} Method:{args.method} Set: {args.set_type} \nDeliver Rate: {deliver_cnt/180}" )
|
733 |
+
# elif args.set_type == 'test':
|
734 |
+
# print(f"Model:{args.model_type} Method:{args.method} Set: {args.set_type} \nDeliver Rate: {deliver_cnt/1000}" )
|
735 |
+
|
evaluation/eval.py
ADDED
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from commonsenseConstraint import evaluation as commonsense_eval
|
2 |
+
from hardConstraint import evaluation as hard_eval
|
3 |
+
import json
|
4 |
+
from tqdm import tqdm
|
5 |
+
from datasets import load_dataset
|
6 |
+
|
7 |
+
|
8 |
+
def load_line_json_data(filename):
    """Load a JSON-Lines file and return its records as a list of objects."""
    with open(filename, 'r', encoding='utf-8') as handle:
        content = handle.read()
    # One JSON document per newline-separated record.
    return [json.loads(record) for record in content.strip().split('\n')]
|
15 |
+
|
16 |
+
def count_true_false(data):
    """Return a ``(true_count, false_count)`` pair for the values in *data*.

    Counting uses ``==`` comparison, matching ``list.count`` semantics
    (so ``1`` counts as ``True`` and ``0`` as ``False``).
    """
    truths = sum(1 for value in data if value == True)
    falses = sum(1 for value in data if value == False)
    return truths, falses
|
21 |
+
|
22 |
+
def statistics(commonsense_statistic):
    """Aggregate per-check pass/fail totals for every (level, day) bucket.

    *commonsense_statistic* maps level -> day -> list of per-plan result
    dicts (or a falsy placeholder for undelivered plans); each result dict
    maps a check name to a (passed, reason) pair. The aggregate maps
    level -> day -> check name -> {"true": n, "false": n}.
    """
    aggregate = {
        level: {day: {} for day in days}
        for level, days in commonsense_statistic.items()
    }

    for level, days in commonsense_statistic.items():
        for day, outcomes in days.items():
            for outcome in outcomes:
                if not outcome:
                    continue  # plan was never evaluated; nothing to tally
                for check, verdict in outcome.items():
                    bucket = aggregate[level][day].setdefault(check, {"true": 0, "false": 0})
                    # Inlined tally (formerly count_true_false): counts use
                    # == comparison via the container's count().
                    bucket["true"] += verdict.count(True)
                    bucket["false"] += verdict.count(False)

    return aggregate
|
38 |
+
|
39 |
+
|
40 |
+
def eval_score(validation_or_test: str, file_path: str, TOKEN):
    """Score a submission file against the benchmark's evaluation split.

    Loads the reference queries from the Hugging Face hub, runs the
    commonsense and hard-constraint evaluators over each submitted plan,
    and aggregates micro/macro pass rates plus the delivery and final
    pass rates.

    Args:
        validation_or_test: 'validation' or 'test' — selects the split and
            the hard-coded denominators used for the rates.
        file_path: path to the submitted JSON-Lines plan file.
        TOKEN: Hugging Face access token for the gated dataset.

    Returns:
        dict of named rate metrics for the chosen split.
    """
    if validation_or_test == 'validation':
        query_data_list  = load_dataset('osunlp/TravelBenchEval','validation',token=TOKEN)['validation']
    elif validation_or_test == 'test':
        query_data_list  = load_dataset('osunlp/TravelBenchEval','test',token=TOKEN)['test']

    query_data_list = [x for x in query_data_list]
    hardConstraint_statistic= {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
    commonsenseConstraint_statistic = {level:{day:[] for day in [3,5,7]} for level in ['easy','medium','hard']}
    tested_plans = load_line_json_data(file_path)
    delivery_cnt = 0
    plan_constraint_store = []
    for idx in tqdm(range(0,len(query_data_list))):
        query_data = query_data_list[idx]
        tested_plan = tested_plans[idx]
        # SECURITY NOTE: eval() executes arbitrary Python from the dataset /
        # submission strings. Safe only because inputs are trusted here;
        # ast.literal_eval would be the hardened alternative.
        if type(query_data) == str:
            query_data = eval(query_data)
        if type(tested_plan) == str:
            tested_plan = eval(tested_plan)
        if type(query_data['local_constraint']) == str:
            query_data['local_constraint'] = eval(query_data['local_constraint'])

        # A plan counts as "delivered" when the submission has any plan body.
        if tested_plan['plan']:
            delivery_cnt += 1
            commonsense_info_box = commonsense_eval(query_data,tested_plan['plan'])
        else:
            commonsense_info_box = None

        # Hard constraints are only evaluated when the plan is complete
        # enough and free of sandbox hallucinations.
        if commonsense_info_box and commonsense_info_box['is_not_absent'][0] and commonsense_info_box['is_valid_information_in_sandbox'][0]:
            hard_info_box = hard_eval(query_data,tested_plan['plan'])
        else:
            hard_info_box = None

        plan_constraint_store.append({'commonsense_constraint':commonsense_info_box,'hard_constraint':hard_info_box})

        commonsenseConstraint_statistic[query_data['level']][query_data['days']].append(commonsense_info_box)
        hardConstraint_statistic[query_data['level']][query_data['days']].append(hard_info_box)

    commonsenseConstraint_statistic_processed = statistics(commonsenseConstraint_statistic)
    hardConstraint_statistic_processed = statistics(hardConstraint_statistic)
    # print(commonsenseConstraint_statistic_processed)
    # print(hardConstraint_statistic_processed)
    # Per-(level, day) counts of how many queries actually carry each local
    # constraint; used as the micro denominator for hard constraints.
    constraint_record = {key: {day: {'house rule':0, 'cuisine':0, 'room type':0, 'transportation':0} for day in [3,5,7]} for key in ['medium','hard']}
    constraint_mapping = {'house rule':'valid_room_rule','cuisine':'valid_cuisine','room type':'valid_room_type','transportation':'valid_transportation'}
    mapping_constraint_record = {key: {day: {'valid_room_rule':0, 'valid_cuisine':0, 'valid_room_type':0, 'valid_transportation':0} for day in [3,5,7]} for key in ['medium','hard']}
    count_record = {key:{day:0 for day in [3,5,7]} for key in ['easy','medium','hard']}

    for unit in query_data_list:
        count_record[unit['level']][unit['days']] += 1
        for key in constraint_record['medium'][3]:
            if unit['local_constraint'][key] != None:
                constraint_record[unit['level']][unit['days']][key] += 1
                mapping_constraint_record[unit['level']][unit['days']][constraint_mapping[key]] += 1

    # data_record holds human-readable "passed/total" strings per check.
    data_record = {key:{day:[] for day in [3,5,7]} for key in ['easy','medium','hard']}

    constraint_dis_record = {"commonsense":{"pass":0,"total":0},"hard":{"pass":0,"total":0}}

    for constraint in ['commonsense','hard']:
        if constraint == 'commonsense':
            constraint_statistic = commonsenseConstraint_statistic_processed
        elif constraint == 'hard':
            constraint_statistic = hardConstraint_statistic_processed

        key_dict = {'commonsense':['is_valid_information_in_current_city','is_valid_information_in_sandbox','is_reasonalbe_visiting_city','is_valid_restaurants','is_valid_transportation','is_valid_attractions','is_valid_accommodation','is_not_absent'],'hard':['valid_cost','valid_room_rule','valid_cuisine','valid_room_type','valid_transportation']}

        for key in constraint_statistic:
            # level
            for key2 in constraint_statistic[key]:
                # day
                # print(key2)
                # key2 = eval(key2)
                if key2 == -1:
                    # Defensive guard against an unexpected day bucket.
                    print(constraint_statistic[key])
                    exit(0)
                for key3 in key_dict[constraint]:
                    data_record[key][key2].append('0/0')
                    if key3 in constraint_statistic[key][key2]:
                        constraint_dis_record[constraint]['pass'] += constraint_statistic[key][key2][key3]['true']
                        if constraint == 'hard':
                            # Optional hard constraints are normalised by how
                            # many queries actually carry them.
                            if key == 'hard' and key3 in ['valid_room_rule','valid_cuisine','valid_room_type','valid_transportation']:
                                data_record[key][key2][-1] = f"{constraint_statistic[key][key2][key3]['true']}/{mapping_constraint_record[key][key2][key3]}"
                                constraint_dis_record[constraint]['total'] += mapping_constraint_record[key][key2][key3]
                            elif key == 'medium' and key3 in ['valid_room_rule','valid_cuisine','valid_room_type']:
                                data_record[key][key2][-1] = f"{constraint_statistic[key][key2][key3]['true']}/{mapping_constraint_record[key][key2][key3]}"
                                constraint_dis_record[constraint]['total'] += mapping_constraint_record[key][key2][key3]
                            else:
                                data_record[key][key2][-1] = f"{constraint_statistic[key][key2][key3]['true']}/{count_record[key][key2]}"
                                if key3 in ['valid_cost','valid_visitng_city_number','valid_days']:
                                    constraint_dis_record[constraint]['total'] += count_record[key][key2]
                        else:
                            # Commonsense checks apply to every query.
                            data_record[key][key2][-1] = f"{constraint_statistic[key][key2][key3]['true']}/{count_record[key][key2]}"
                            constraint_dis_record[constraint]['total'] += count_record[key][key2]

    final_all_cnt = 0
    final_commonsense_cnt = 0
    final_hardConstraint_cnt = 0
    # Per-level breakdown of fully passing plans (computed but not returned).
    final_all_cnt_map = {level:0 for level in ['easy','medium','hard']}
    for idx in (range(0,len(query_data_list))):
        if plan_constraint_store[idx]['commonsense_constraint']:
            final_commonsense_pass = True
            final_hardConstraint_pass = True
            for item in plan_constraint_store[idx]['commonsense_constraint']:
                if plan_constraint_store[idx]['commonsense_constraint'][item][0] is not None and not plan_constraint_store[idx]['commonsense_constraint'][item][0]:
                    final_commonsense_pass = False
                    break
            # No hard-constraint evaluation means the plan cannot count
            # toward any macro total.
            if plan_constraint_store[idx]['hard_constraint'] is None:
                continue
            for item in plan_constraint_store[idx]['hard_constraint']:
                if plan_constraint_store[idx]['hard_constraint'][item][0] is not None and plan_constraint_store[idx]['hard_constraint'][item][0] == False:
                    final_hardConstraint_pass = False
                    break

            if final_commonsense_pass:
                final_commonsense_cnt += 1
            if final_hardConstraint_pass:
                final_hardConstraint_cnt += 1
            if final_commonsense_pass and final_hardConstraint_pass:
                final_all_cnt += 1
                final_all_cnt_map[query_data_list[idx]['level']] += 1

    result = {}

    # Denominators are fixed per split: 180 validation / 1000 test queries;
    # micro denominators are presumably queries x checks (e.g. 180*8=1440) —
    # TODO confirm against the benchmark definition.
    if validation_or_test == 'validation':
        result['Delivery Rate'] = delivery_cnt / 180
        result['Commonsense Constraint Micro Pass Rate'] = constraint_dis_record['commonsense']['pass'] / 1440
        result['Commonsense Constraint Macro Pass Rate'] = final_commonsense_cnt / 180
        result['Hard Constraint Micro Pass Rate'] = constraint_dis_record['hard']['pass'] / 420
        result['Hard Constraint Macro Pass Rate'] = final_hardConstraint_cnt / 180
        result['Final Pass Rate'] = final_all_cnt / 180

    elif validation_or_test == 'test':
        result['Delivery Rate'] = delivery_cnt / 1000
        result['Commonsense Constraint Micro Pass Rate'] = constraint_dis_record['commonsense']['pass'] / 8000
        result['Commonsense Constraint Macro Pass Rate'] = final_commonsense_cnt / 1000
        result['Hard Constraint Micro Pass Rate'] = constraint_dis_record['hard']['pass'] / 2290
        result['Hard Constraint Macro Pass Rate'] = final_hardConstraint_cnt / 1000
        result['Final Pass Rate'] = final_all_cnt / 1000

    return result
|
181 |
+
|
evaluation/hardConstraint.py
ADDED
@@ -0,0 +1,266 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from annotation.src.utils import get_valid_name_city,extract_before_parenthesis,extract_numbers_from_filenames
|
2 |
+
from tools.flights.apis import Flights
|
3 |
+
from tools.accommodations.apis import Accommodations
|
4 |
+
from tools.restaurants.apis import Restaurants
|
5 |
+
from tools.googleDistanceMatrix.apis import GoogleDistanceMatrix
|
6 |
+
from tools.attractions.apis import Attractions
|
7 |
+
import math
|
8 |
+
import json
|
9 |
+
import re
|
10 |
+
import numpy as np
|
11 |
+
import os
|
12 |
+
import sys
|
13 |
+
from tqdm import tqdm
|
14 |
+
import argparse
|
15 |
+
|
16 |
+
# Make the project root importable and anchor relative database paths at
# this file's own directory (so the "../database/..." defaults resolve).
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))


# Module-level tool singletons shared by every constraint check below;
# each loads its CSV database once at import time.
flight = Flights()
accommodation = Accommodations()
restaurants = Restaurants()
googleDistanceMatrix = GoogleDistanceMatrix()
attractions = Attractions()
|
25 |
+
|
26 |
+
|
27 |
+
def load_line_json_data(filename):
    """Load a JSONL file (one JSON object per line) into a list of dicts."""
    with open(filename, 'r', encoding='utf-8') as fp:
        content = fp.read().strip()
    return [json.loads(row) for row in content.split('\n')]
|
34 |
+
|
35 |
+
|
36 |
+
def convert_bool_values(item):
    """Recursively replace numpy ``bool_`` values with plain Python bools.

    Dicts, lists and tuples are rebuilt with converted contents; every
    other value is returned unchanged.
    """
    # numpy bools are not containers, so checking them first is safe.
    if isinstance(item, np.bool_):
        return bool(item)
    if isinstance(item, dict):
        return {k: convert_bool_values(v) for k, v in item.items()}
    if isinstance(item, list):
        return [convert_bool_values(v) for v in item]
    if isinstance(item, tuple):
        return tuple(convert_bool_values(v) for v in item)
    return item
|
52 |
+
|
53 |
+
|
54 |
+
|
55 |
+
|
56 |
+
def extract_from_to(text: str):
    """Pull the origin and destination out of a "from A to B" phrase.

    Returns a tuple (A, B) where B stops at a comma or the end of the
    string; returns (None, None) when no such phrase is present.
    """
    m = re.search(r"from\s+(.+?)\s+to\s+([^,]+)(?=[,\s]|$)", text)
    if m is None:
        return (None, None)
    return m.groups()
|
69 |
+
|
70 |
+
|
71 |
+
def get_total_cost(question, tested_data):
    """Sum the plan's total price: transportation + three meals + lodging.

    Args:
        question: query dict; reads 'days' and 'people_number'.
        tested_data: list of per-day plan dicts with 'transportation',
            'current_city', 'breakfast', 'lunch', 'dinner' and
            'accommodation' entries ('-' means "none").

    Returns:
        Accumulated cost over min(days, len(tested_data)) days. Entries
        that cannot be parsed or matched against the databases are
        silently skipped (best-effort, never raises for a bad entry).
    """
    total_cost = 0
    for i in range(min(question['days'],len(tested_data))):
        unit = tested_data[i]
        # transportation: parse "from A to B" out of the entry itself,
        # falling back to the day's 'current_city' field.
        if unit['transportation'] and unit['transportation'] != '-':
            value = unit['transportation']
            org_city, dest_city = extract_from_to(value)
            if org_city == None or dest_city == None:
                org_city, dest_city = extract_from_to(unit['current_city'])

            # Unparseable route: skip this day's transportation cost.
            if org_city == None or dest_city == None:
                pass
            else:
                if 'flight number' in value.lower():
                    # Match the flight row by the quoted flight number.
                    res = flight.data[flight.data['Flight Number'] == value.split('Flight Number: ')[1].split(',')[0]]
                    if len(res) > 0:
                        # Flights are priced per person.
                        total_cost += res['Price'].values[0] * question['people_number']

                elif 'self-driving' in value.lower() or 'taxi' in value.lower():
                    if 'self-driving' in value.lower():
                        # One rental car carries up to 5 people.
                        cost = googleDistanceMatrix.run_for_evaluation(org_city,dest_city,'self-driving')['cost']
                        total_cost += cost * math.ceil(question['people_number'] * 1.0 / 5)
                    else:
                        # One taxi carries up to 4 people.
                        cost = googleDistanceMatrix.run_for_evaluation(org_city,dest_city,'taxi')['cost']
                        total_cost += cost * math.ceil(question['people_number'] * 1.0 / 4)

        # breakfast: per-person average cost from the restaurant database.
        if unit['breakfast'] and unit['breakfast'] != '-':
            name, city = get_valid_name_city(unit['breakfast'])
            res = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
            if len(res) > 0:
                total_cost += res['Average Cost'].values[0] * question['people_number']


        # lunch: same lookup and per-person pricing as breakfast.
        if unit['lunch'] and unit['lunch'] != '-':
            name, city = get_valid_name_city(unit['lunch'])
            res = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
            if len(res) > 0:
                total_cost += res['Average Cost'].values[0] * question['people_number']

        # dinner: same lookup and per-person pricing as breakfast.
        if unit['dinner'] and unit['dinner'] != '-':
            name, city = get_valid_name_city(unit['dinner'])
            res = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
            if len(res) > 0:
                total_cost += res['Average Cost'].values[0] * question['people_number']

        # accommodation: price is per room per night; one room holds
        # 'maximum occupancy' people, so book ceil(people / occupancy) rooms.
        if unit['accommodation'] and unit['accommodation'] != '-':
            name, city = get_valid_name_city(unit['accommodation'])
            res = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
            if len(res) > 0:
                total_cost += res['price'].values[0] * math.ceil(question['people_number'] * 1.0 / res['maximum occupancy'].values[0])
    return total_cost
|
129 |
+
|
130 |
+
|
131 |
+
def is_valid_room_rule(question, tested_data):
    """Check booked accommodations against the query's house-rule constraint.

    Returns (None, None) when the query sets no house rule,
    (False, reason) on the first accommodation whose listed rules forbid
    the requested activity, otherwise (True, None).
    """
    if question['local_constraint']['house rule'] is None:
        return None,None

    for i in range(min(question['days'],len(tested_data))):
        unit = tested_data[i]
        if unit['accommodation'] and unit['accommodation'] != '-':
            name, city = get_valid_name_city(unit['accommodation'])
            res = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
            if len(res) > 0:
                # A constraint like 'smoking' means the traveler wants to
                # smoke, so a listing whose rules say 'No smoking' fails.
                if question['local_constraint']['house rule'] == 'smoking' and 'No smoking' in str(res['house_rules'].values[0]):
                    return False, f"The house rule should be {question['local_constraint']['house rule']}."
                # NOTE(review): 'parities' looks like a typo for 'parties' —
                # confirm which spelling the query data actually uses before
                # changing it, otherwise this branch may never trigger.
                if question['local_constraint']['house rule'] == 'parities' and 'No parties' in str(res['house_rules'].values[0]):
                    return False, f"The house rule should be {question['local_constraint']['house rule']}."
                if question['local_constraint']['house rule'] == 'children under 10' and 'No children under 10' in str(res['house_rules'].values[0]):
                    return False, f"The house rule should be {question['local_constraint']['house rule']}."
                if question['local_constraint']['house rule'] == 'visitors' and 'No visitors' in str(res['house_rules'].values[0]):
                    return False, f"The house rule should be {question['local_constraint']['house rule']}."
                if question['local_constraint']['house rule'] == 'pets' and 'No pets' in str(res['house_rules'].values[0]):
                    return False, f"The house rule should be {question['local_constraint']['house rule']}."


    return True, None
|
155 |
+
|
156 |
+
|
157 |
+
|
158 |
+
def is_valid_cuisine(question, tested_data):
    """Check that every requested cuisine appears in at least one chosen restaurant.

    Bug fix: the original code used ``continue`` after the origin-city
    check inside the breakfast and lunch branches, which skipped the
    REST OF THE DAY (lunch/dinner were never inspected when an earlier
    meal was in the origin city). Meals are now iterated individually so
    only the offending meal is skipped.

    Args:
        question: query dict; reads 'days', 'org' and
            question['local_constraint']['cuisine'] (list of cuisines or None).
        tested_data: list of per-day plan dicts with 'breakfast', 'lunch'
            and 'dinner' entries ('-' means "none").

    Returns:
        (None, None) when no cuisine constraint is set,
        (True, None) when every requested cuisine is covered,
        (False, reason) naming the first uncovered cuisine otherwise.
    """
    wanted = question['local_constraint']['cuisine']
    if not wanted:
        return None, None

    satisfied = set()
    for i in range(min(question['days'], len(tested_data))):
        unit = tested_data[i]
        for meal in ('breakfast', 'lunch', 'dinner'):
            if not unit[meal] or unit[meal] == '-':
                continue
            name, city = get_valid_name_city(unit[meal])
            # Restaurants in the origin city do not count toward the
            # constraint — skip only this meal, not the whole day.
            if city == question['org']:
                continue
            res = restaurants.data[(restaurants.data['Name'].astype(str).str.contains(re.escape(name))) & (restaurants.data['City'] == city)]
            if len(res) > 0:
                for cuisine in wanted:
                    if cuisine in res.iloc[0]['Cuisines']:
                        satisfied.add(cuisine)

    if len(satisfied) == len(wanted):
        return True, None
    # Report the first cuisine that was never served.
    for cuisine in wanted:
        if cuisine not in satisfied:
            return False, f"The cuisine {cuisine} is not satisfied."
|
204 |
+
|
205 |
+
|
206 |
+
def is_valid_transportation(question, tested_data):
    """Verify the plan honors a 'no flight' / 'no self-driving' constraint.

    Returns (None, None) when no transportation constraint is set,
    (False, reason) on the first violating day, otherwise (True, None).
    """
    constraint = question['local_constraint']['transportation']
    if constraint is None:
        return None, None

    # Keyword that must not appear in any day's transportation entry.
    forbidden = {'no flight': 'Flight', 'no self-driving': 'Self-driving'}.get(constraint)
    day_count = min(question['days'], len(tested_data))
    for day in tested_data[:day_count]:
        entry = day['transportation']
        if entry and entry != '-' and forbidden is not None and forbidden in entry:
            return False, f"The transportation should not be {constraint}."

    return True, None
|
219 |
+
|
220 |
+
|
221 |
+
def is_valid_room_type(question, tested_data):
    """Check every booked accommodation against the requested room type.

    Recognized constraints: 'not shared room', 'shared room',
    'private room', 'entire room'. Returns (None, None) when no
    room-type constraint is set, (False, reason) on the first mismatch,
    otherwise (True, None).
    """
    wanted = question['local_constraint']['room type']
    if wanted is None:
        return None, None

    failure = (False, f"The room type should be {wanted}.")
    for day in tested_data[:min(question['days'], len(tested_data))]:
        if not day['accommodation'] or day['accommodation'] == '-':
            continue
        name, city = get_valid_name_city(day['accommodation'])
        match = accommodation.data[(accommodation.data['NAME'].astype(str).str.contains(re.escape(name))) & (accommodation.data['city'] == city)]
        if len(match) == 0:
            # Unknown listing: the cost/validity checks handle this case.
            continue
        actual = match['room type'].values[0]
        if wanted == 'not shared room' and actual == 'Shared room':
            return failure
        if wanted == 'shared room' and actual != 'Shared room':
            return failure
        if wanted == 'private room' and actual != 'Private room':
            return failure
        if wanted == 'entire room' and actual != 'Entire home/apt':
            return failure

    return True, None
|
243 |
+
|
244 |
+
|
245 |
+
def evaluation(query_data, tested_data):
    """Run every hard-constraint check and collect its (passed, reason) pair.

    Each value is (True/False/None, message-or-None); None means the
    constraint was not part of the query.
    """
    within_budget = bool(get_total_cost(query_data, tested_data) <= query_data['budget'])
    return {
        'valid_cuisine': is_valid_cuisine(query_data, tested_data),
        'valid_room_rule': is_valid_room_rule(query_data, tested_data),
        'valid_transportation': is_valid_transportation(query_data, tested_data),
        'valid_room_type': is_valid_room_type(query_data, tested_data),
        'valid_cost': (within_budget, None),
    }
|
253 |
+
|
254 |
+
def boolean_evaluation(query_data, tested_data):
    """Return True only when every hard constraint passes.

    Prints the name of the first failing check before returning False;
    a None verdict (constraint absent from the query) counts as passing.
    """
    checks = {
        'valid_cuisine': is_valid_cuisine(query_data, tested_data),
        'valid_room_rule': is_valid_room_rule(query_data, tested_data),
        'valid_transportation': is_valid_transportation(query_data, tested_data),
        'valid_room_type': is_valid_room_type(query_data, tested_data),
        'valid_cost': (bool(get_total_cost(query_data, tested_data) <= query_data['budget']), None),
    }
    for label, verdict in checks.items():
        if verdict[0] == False:
            print(label)
            return False
    return True
|
266 |
+
|
evaluation/scored/1_validation_two-stage_1.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"Delivery Rate": 0.8944444444444445, "Commonsense Constraint Micro Pass Rate": 0.6111111111111112, "Commonsense Constraint Macro Pass Rate": 0.027777777777777776, "Hard Constraint Micro Pass Rate": 0.1523809523809524, "Hard Constraint Macro Pass Rate": 0.10555555555555556, "Final Pass Rate": 0.005555555555555556}
|
evaluation/scored/textbox_validation_two-stage_1.jsonl
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
{"Delivery Rate": 0.8944444444444445, "Commonsense Constraint Micro Pass Rate": 0.6111111111111112, "Commonsense Constraint Macro Pass Rate": 0.027777777777777776, "Hard Constraint Micro Pass Rate": 0.1523809523809524, "Hard Constraint Macro Pass Rate": 0.10555555555555556, "Final Pass Rate": 0.005555555555555556}
|
requirements.txt
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
datasets==2.16.1
|
2 |
+
gradio==3.50.2
|
3 |
+
huggingface-hub==0.20.2
|
tools/__init__.py
ADDED
File without changes
|
tools/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (170 Bytes). View file
|
|
tools/accommodations/.ipynb_checkpoints/test-checkpoint.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
tools/accommodations/__init__.py
ADDED
File without changes
|
tools/accommodations/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (185 Bytes). View file
|
|
tools/accommodations/__pycache__/apis.cpython-39.pyc
ADDED
Binary file (1.57 kB). View file
|
|
tools/accommodations/apis.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from pandas import DataFrame
|
3 |
+
from typing import Optional
|
4 |
+
from annotation.src.utils import extract_before_parenthesis
|
5 |
+
|
6 |
+
|
7 |
+
class Accommodations:
    """Lookup tool over the accommodations CSV database.

    Fixes: the empty-result message previously said "no attraction"
    (copy-paste from the attractions tool); it now correctly says
    "no accommodation". Dead commented-out sorting/filtering code removed.
    """

    def __init__(self, path="../database/accommodations/clean_accommodations_2022.csv"):
        self.path = path
        # Keep only the columns the planners and evaluators actually read.
        self.data = pd.read_csv(self.path).dropna()[
            ['NAME', 'price', 'room type', 'house_rules', 'minimum nights',
             'maximum occupancy', 'review rate number', 'city']
        ]
        print("Accommodations loaded.")

    def load_db(self):
        """Reload the CSV from disk (all columns, NaN rows dropped)."""
        self.data = pd.read_csv(self.path).dropna()

    def run(self,
            city: str,
            ) -> DataFrame:
        """Search for accommodations by city.

        Returns a DataFrame of matching rows, or an explanatory string
        when the city has no accommodations.
        """
        results = self.data[self.data["city"] == city]
        if len(results) == 0:
            return "There is no accommodation in this city."

        return results

    def run_for_annotation(self,
                           city: str,
                           ) -> DataFrame:
        """Search for accommodations by city for the annotation UI.

        Strips any parenthesized suffix from the annotator-supplied city
        (e.g. "Houston(Texas)" -> "Houston") before matching; may return
        an empty DataFrame.
        """
        return self.data[self.data["city"] == extract_before_parenthesis(city)]
|
tools/accommodations/test.ipynb
ADDED
@@ -0,0 +1,2037 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 1,
|
6 |
+
"id": "ad7592e7",
|
7 |
+
"metadata": {},
|
8 |
+
"outputs": [
|
9 |
+
{
|
10 |
+
"name": "stderr",
|
11 |
+
"output_type": "stream",
|
12 |
+
"text": [
|
13 |
+
"/tmp/ipykernel_2459435/230780042.py:2: DtypeWarning: Columns (25) have mixed types. Specify dtype option on import or set low_memory=False.\n",
|
14 |
+
" data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/Airbnb_Open_Data.csv')\n"
|
15 |
+
]
|
16 |
+
}
|
17 |
+
],
|
18 |
+
"source": [
|
19 |
+
"import pandas as pd\n",
|
20 |
+
"data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/Airbnb_Open_Data.csv')"
|
21 |
+
]
|
22 |
+
},
|
23 |
+
{
|
24 |
+
"cell_type": "code",
|
25 |
+
"execution_count": 2,
|
26 |
+
"id": "f97916a9",
|
27 |
+
"metadata": {},
|
28 |
+
"outputs": [
|
29 |
+
{
|
30 |
+
"data": {
|
31 |
+
"text/html": [
|
32 |
+
"<div>\n",
|
33 |
+
"<style scoped>\n",
|
34 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
35 |
+
" vertical-align: middle;\n",
|
36 |
+
" }\n",
|
37 |
+
"\n",
|
38 |
+
" .dataframe tbody tr th {\n",
|
39 |
+
" vertical-align: top;\n",
|
40 |
+
" }\n",
|
41 |
+
"\n",
|
42 |
+
" .dataframe thead th {\n",
|
43 |
+
" text-align: right;\n",
|
44 |
+
" }\n",
|
45 |
+
"</style>\n",
|
46 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
47 |
+
" <thead>\n",
|
48 |
+
" <tr style=\"text-align: right;\">\n",
|
49 |
+
" <th></th>\n",
|
50 |
+
" <th>id</th>\n",
|
51 |
+
" <th>NAME</th>\n",
|
52 |
+
" <th>host id</th>\n",
|
53 |
+
" <th>host_identity_verified</th>\n",
|
54 |
+
" <th>host name</th>\n",
|
55 |
+
" <th>neighbourhood group</th>\n",
|
56 |
+
" <th>neighbourhood</th>\n",
|
57 |
+
" <th>lat</th>\n",
|
58 |
+
" <th>long</th>\n",
|
59 |
+
" <th>country</th>\n",
|
60 |
+
" <th>...</th>\n",
|
61 |
+
" <th>service fee</th>\n",
|
62 |
+
" <th>minimum nights</th>\n",
|
63 |
+
" <th>number of reviews</th>\n",
|
64 |
+
" <th>last review</th>\n",
|
65 |
+
" <th>reviews per month</th>\n",
|
66 |
+
" <th>review rate number</th>\n",
|
67 |
+
" <th>calculated host listings count</th>\n",
|
68 |
+
" <th>availability 365</th>\n",
|
69 |
+
" <th>house_rules</th>\n",
|
70 |
+
" <th>license</th>\n",
|
71 |
+
" </tr>\n",
|
72 |
+
" </thead>\n",
|
73 |
+
" <tbody>\n",
|
74 |
+
" <tr>\n",
|
75 |
+
" <th>0</th>\n",
|
76 |
+
" <td>1001254</td>\n",
|
77 |
+
" <td>Clean & quiet apt home by the park</td>\n",
|
78 |
+
" <td>80014485718</td>\n",
|
79 |
+
" <td>unconfirmed</td>\n",
|
80 |
+
" <td>Madaline</td>\n",
|
81 |
+
" <td>Brooklyn</td>\n",
|
82 |
+
" <td>Kensington</td>\n",
|
83 |
+
" <td>40.64749</td>\n",
|
84 |
+
" <td>-73.97237</td>\n",
|
85 |
+
" <td>United States</td>\n",
|
86 |
+
" <td>...</td>\n",
|
87 |
+
" <td>$193</td>\n",
|
88 |
+
" <td>10.0</td>\n",
|
89 |
+
" <td>9.0</td>\n",
|
90 |
+
" <td>10/19/2021</td>\n",
|
91 |
+
" <td>0.21</td>\n",
|
92 |
+
" <td>4.0</td>\n",
|
93 |
+
" <td>6.0</td>\n",
|
94 |
+
" <td>286.0</td>\n",
|
95 |
+
" <td>Clean up and treat the home the way you'd like...</td>\n",
|
96 |
+
" <td>NaN</td>\n",
|
97 |
+
" </tr>\n",
|
98 |
+
" <tr>\n",
|
99 |
+
" <th>1</th>\n",
|
100 |
+
" <td>1002102</td>\n",
|
101 |
+
" <td>Skylit Midtown Castle</td>\n",
|
102 |
+
" <td>52335172823</td>\n",
|
103 |
+
" <td>verified</td>\n",
|
104 |
+
" <td>Jenna</td>\n",
|
105 |
+
" <td>Manhattan</td>\n",
|
106 |
+
" <td>Midtown</td>\n",
|
107 |
+
" <td>40.75362</td>\n",
|
108 |
+
" <td>-73.98377</td>\n",
|
109 |
+
" <td>United States</td>\n",
|
110 |
+
" <td>...</td>\n",
|
111 |
+
" <td>$28</td>\n",
|
112 |
+
" <td>30.0</td>\n",
|
113 |
+
" <td>45.0</td>\n",
|
114 |
+
" <td>5/21/2022</td>\n",
|
115 |
+
" <td>0.38</td>\n",
|
116 |
+
" <td>4.0</td>\n",
|
117 |
+
" <td>2.0</td>\n",
|
118 |
+
" <td>228.0</td>\n",
|
119 |
+
" <td>Pet friendly but please confirm with me if the...</td>\n",
|
120 |
+
" <td>NaN</td>\n",
|
121 |
+
" </tr>\n",
|
122 |
+
" <tr>\n",
|
123 |
+
" <th>2</th>\n",
|
124 |
+
" <td>1002403</td>\n",
|
125 |
+
" <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
|
126 |
+
" <td>78829239556</td>\n",
|
127 |
+
" <td>NaN</td>\n",
|
128 |
+
" <td>Elise</td>\n",
|
129 |
+
" <td>Manhattan</td>\n",
|
130 |
+
" <td>Harlem</td>\n",
|
131 |
+
" <td>40.80902</td>\n",
|
132 |
+
" <td>-73.94190</td>\n",
|
133 |
+
" <td>United States</td>\n",
|
134 |
+
" <td>...</td>\n",
|
135 |
+
" <td>$124</td>\n",
|
136 |
+
" <td>3.0</td>\n",
|
137 |
+
" <td>0.0</td>\n",
|
138 |
+
" <td>NaN</td>\n",
|
139 |
+
" <td>NaN</td>\n",
|
140 |
+
" <td>5.0</td>\n",
|
141 |
+
" <td>1.0</td>\n",
|
142 |
+
" <td>352.0</td>\n",
|
143 |
+
" <td>I encourage you to use my kitchen, cooking and...</td>\n",
|
144 |
+
" <td>NaN</td>\n",
|
145 |
+
" </tr>\n",
|
146 |
+
" <tr>\n",
|
147 |
+
" <th>3</th>\n",
|
148 |
+
" <td>1002755</td>\n",
|
149 |
+
" <td>NaN</td>\n",
|
150 |
+
" <td>85098326012</td>\n",
|
151 |
+
" <td>unconfirmed</td>\n",
|
152 |
+
" <td>Garry</td>\n",
|
153 |
+
" <td>Brooklyn</td>\n",
|
154 |
+
" <td>Clinton Hill</td>\n",
|
155 |
+
" <td>40.68514</td>\n",
|
156 |
+
" <td>-73.95976</td>\n",
|
157 |
+
" <td>United States</td>\n",
|
158 |
+
" <td>...</td>\n",
|
159 |
+
" <td>$74</td>\n",
|
160 |
+
" <td>30.0</td>\n",
|
161 |
+
" <td>270.0</td>\n",
|
162 |
+
" <td>7/5/2019</td>\n",
|
163 |
+
" <td>4.64</td>\n",
|
164 |
+
" <td>4.0</td>\n",
|
165 |
+
" <td>1.0</td>\n",
|
166 |
+
" <td>322.0</td>\n",
|
167 |
+
" <td>NaN</td>\n",
|
168 |
+
" <td>NaN</td>\n",
|
169 |
+
" </tr>\n",
|
170 |
+
" <tr>\n",
|
171 |
+
" <th>4</th>\n",
|
172 |
+
" <td>1003689</td>\n",
|
173 |
+
" <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
|
174 |
+
" <td>92037596077</td>\n",
|
175 |
+
" <td>verified</td>\n",
|
176 |
+
" <td>Lyndon</td>\n",
|
177 |
+
" <td>Manhattan</td>\n",
|
178 |
+
" <td>East Harlem</td>\n",
|
179 |
+
" <td>40.79851</td>\n",
|
180 |
+
" <td>-73.94399</td>\n",
|
181 |
+
" <td>United States</td>\n",
|
182 |
+
" <td>...</td>\n",
|
183 |
+
" <td>$41</td>\n",
|
184 |
+
" <td>10.0</td>\n",
|
185 |
+
" <td>9.0</td>\n",
|
186 |
+
" <td>11/19/2018</td>\n",
|
187 |
+
" <td>0.10</td>\n",
|
188 |
+
" <td>3.0</td>\n",
|
189 |
+
" <td>1.0</td>\n",
|
190 |
+
" <td>289.0</td>\n",
|
191 |
+
" <td>Please no smoking in the house, porch or on th...</td>\n",
|
192 |
+
" <td>NaN</td>\n",
|
193 |
+
" </tr>\n",
|
194 |
+
" <tr>\n",
|
195 |
+
" <th>...</th>\n",
|
196 |
+
" <td>...</td>\n",
|
197 |
+
" <td>...</td>\n",
|
198 |
+
" <td>...</td>\n",
|
199 |
+
" <td>...</td>\n",
|
200 |
+
" <td>...</td>\n",
|
201 |
+
" <td>...</td>\n",
|
202 |
+
" <td>...</td>\n",
|
203 |
+
" <td>...</td>\n",
|
204 |
+
" <td>...</td>\n",
|
205 |
+
" <td>...</td>\n",
|
206 |
+
" <td>...</td>\n",
|
207 |
+
" <td>...</td>\n",
|
208 |
+
" <td>...</td>\n",
|
209 |
+
" <td>...</td>\n",
|
210 |
+
" <td>...</td>\n",
|
211 |
+
" <td>...</td>\n",
|
212 |
+
" <td>...</td>\n",
|
213 |
+
" <td>...</td>\n",
|
214 |
+
" <td>...</td>\n",
|
215 |
+
" <td>...</td>\n",
|
216 |
+
" <td>...</td>\n",
|
217 |
+
" </tr>\n",
|
218 |
+
" <tr>\n",
|
219 |
+
" <th>102594</th>\n",
|
220 |
+
" <td>6092437</td>\n",
|
221 |
+
" <td>Spare room in Williamsburg</td>\n",
|
222 |
+
" <td>12312296767</td>\n",
|
223 |
+
" <td>verified</td>\n",
|
224 |
+
" <td>Krik</td>\n",
|
225 |
+
" <td>Brooklyn</td>\n",
|
226 |
+
" <td>Williamsburg</td>\n",
|
227 |
+
" <td>40.70862</td>\n",
|
228 |
+
" <td>-73.94651</td>\n",
|
229 |
+
" <td>United States</td>\n",
|
230 |
+
" <td>...</td>\n",
|
231 |
+
" <td>$169</td>\n",
|
232 |
+
" <td>1.0</td>\n",
|
233 |
+
" <td>0.0</td>\n",
|
234 |
+
" <td>NaN</td>\n",
|
235 |
+
" <td>NaN</td>\n",
|
236 |
+
" <td>3.0</td>\n",
|
237 |
+
" <td>1.0</td>\n",
|
238 |
+
" <td>227.0</td>\n",
|
239 |
+
" <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
|
240 |
+
" <td>NaN</td>\n",
|
241 |
+
" </tr>\n",
|
242 |
+
" <tr>\n",
|
243 |
+
" <th>102595</th>\n",
|
244 |
+
" <td>6092990</td>\n",
|
245 |
+
" <td>Best Location near Columbia U</td>\n",
|
246 |
+
" <td>77864383453</td>\n",
|
247 |
+
" <td>unconfirmed</td>\n",
|
248 |
+
" <td>Mifan</td>\n",
|
249 |
+
" <td>Manhattan</td>\n",
|
250 |
+
" <td>Morningside Heights</td>\n",
|
251 |
+
" <td>40.80460</td>\n",
|
252 |
+
" <td>-73.96545</td>\n",
|
253 |
+
" <td>United States</td>\n",
|
254 |
+
" <td>...</td>\n",
|
255 |
+
" <td>$167</td>\n",
|
256 |
+
" <td>1.0</td>\n",
|
257 |
+
" <td>1.0</td>\n",
|
258 |
+
" <td>7/6/2015</td>\n",
|
259 |
+
" <td>0.02</td>\n",
|
260 |
+
" <td>2.0</td>\n",
|
261 |
+
" <td>2.0</td>\n",
|
262 |
+
" <td>395.0</td>\n",
|
263 |
+
" <td>House rules: Guests agree to the following ter...</td>\n",
|
264 |
+
" <td>NaN</td>\n",
|
265 |
+
" </tr>\n",
|
266 |
+
" <tr>\n",
|
267 |
+
" <th>102596</th>\n",
|
268 |
+
" <td>6093542</td>\n",
|
269 |
+
" <td>Comfy, bright room in Brooklyn</td>\n",
|
270 |
+
" <td>69050334417</td>\n",
|
271 |
+
" <td>unconfirmed</td>\n",
|
272 |
+
" <td>Megan</td>\n",
|
273 |
+
" <td>Brooklyn</td>\n",
|
274 |
+
" <td>Park Slope</td>\n",
|
275 |
+
" <td>40.67505</td>\n",
|
276 |
+
" <td>-73.98045</td>\n",
|
277 |
+
" <td>United States</td>\n",
|
278 |
+
" <td>...</td>\n",
|
279 |
+
" <td>$198</td>\n",
|
280 |
+
" <td>3.0</td>\n",
|
281 |
+
" <td>0.0</td>\n",
|
282 |
+
" <td>NaN</td>\n",
|
283 |
+
" <td>NaN</td>\n",
|
284 |
+
" <td>5.0</td>\n",
|
285 |
+
" <td>1.0</td>\n",
|
286 |
+
" <td>342.0</td>\n",
|
287 |
+
" <td>NaN</td>\n",
|
288 |
+
" <td>NaN</td>\n",
|
289 |
+
" </tr>\n",
|
290 |
+
" <tr>\n",
|
291 |
+
" <th>102597</th>\n",
|
292 |
+
" <td>6094094</td>\n",
|
293 |
+
" <td>Big Studio-One Stop from Midtown</td>\n",
|
294 |
+
" <td>11160591270</td>\n",
|
295 |
+
" <td>unconfirmed</td>\n",
|
296 |
+
" <td>Christopher</td>\n",
|
297 |
+
" <td>Queens</td>\n",
|
298 |
+
" <td>Long Island City</td>\n",
|
299 |
+
" <td>40.74989</td>\n",
|
300 |
+
" <td>-73.93777</td>\n",
|
301 |
+
" <td>United States</td>\n",
|
302 |
+
" <td>...</td>\n",
|
303 |
+
" <td>$109</td>\n",
|
304 |
+
" <td>2.0</td>\n",
|
305 |
+
" <td>5.0</td>\n",
|
306 |
+
" <td>10/11/2015</td>\n",
|
307 |
+
" <td>0.10</td>\n",
|
308 |
+
" <td>3.0</td>\n",
|
309 |
+
" <td>1.0</td>\n",
|
310 |
+
" <td>386.0</td>\n",
|
311 |
+
" <td>NaN</td>\n",
|
312 |
+
" <td>NaN</td>\n",
|
313 |
+
" </tr>\n",
|
314 |
+
" <tr>\n",
|
315 |
+
" <th>102598</th>\n",
|
316 |
+
" <td>6094647</td>\n",
|
317 |
+
" <td>585 sf Luxury Studio</td>\n",
|
318 |
+
" <td>68170633372</td>\n",
|
319 |
+
" <td>unconfirmed</td>\n",
|
320 |
+
" <td>Rebecca</td>\n",
|
321 |
+
" <td>Manhattan</td>\n",
|
322 |
+
" <td>Upper West Side</td>\n",
|
323 |
+
" <td>40.76807</td>\n",
|
324 |
+
" <td>-73.98342</td>\n",
|
325 |
+
" <td>United States</td>\n",
|
326 |
+
" <td>...</td>\n",
|
327 |
+
" <td>$206</td>\n",
|
328 |
+
" <td>1.0</td>\n",
|
329 |
+
" <td>0.0</td>\n",
|
330 |
+
" <td>NaN</td>\n",
|
331 |
+
" <td>NaN</td>\n",
|
332 |
+
" <td>3.0</td>\n",
|
333 |
+
" <td>1.0</td>\n",
|
334 |
+
" <td>69.0</td>\n",
|
335 |
+
" <td>NaN</td>\n",
|
336 |
+
" <td>NaN</td>\n",
|
337 |
+
" </tr>\n",
|
338 |
+
" </tbody>\n",
|
339 |
+
"</table>\n",
|
340 |
+
"<p>102599 rows × 26 columns</p>\n",
|
341 |
+
"</div>"
|
342 |
+
],
|
343 |
+
"text/plain": [
|
344 |
+
" id NAME \n",
|
345 |
+
"0 1001254 Clean & quiet apt home by the park \\\n",
|
346 |
+
"1 1002102 Skylit Midtown Castle \n",
|
347 |
+
"2 1002403 THE VILLAGE OF HARLEM....NEW YORK ! \n",
|
348 |
+
"3 1002755 NaN \n",
|
349 |
+
"4 1003689 Entire Apt: Spacious Studio/Loft by central park \n",
|
350 |
+
"... ... ... \n",
|
351 |
+
"102594 6092437 Spare room in Williamsburg \n",
|
352 |
+
"102595 6092990 Best Location near Columbia U \n",
|
353 |
+
"102596 6093542 Comfy, bright room in Brooklyn \n",
|
354 |
+
"102597 6094094 Big Studio-One Stop from Midtown \n",
|
355 |
+
"102598 6094647 585 sf Luxury Studio \n",
|
356 |
+
"\n",
|
357 |
+
" host id host_identity_verified host name neighbourhood group \n",
|
358 |
+
"0 80014485718 unconfirmed Madaline Brooklyn \\\n",
|
359 |
+
"1 52335172823 verified Jenna Manhattan \n",
|
360 |
+
"2 78829239556 NaN Elise Manhattan \n",
|
361 |
+
"3 85098326012 unconfirmed Garry Brooklyn \n",
|
362 |
+
"4 92037596077 verified Lyndon Manhattan \n",
|
363 |
+
"... ... ... ... ... \n",
|
364 |
+
"102594 12312296767 verified Krik Brooklyn \n",
|
365 |
+
"102595 77864383453 unconfirmed Mifan Manhattan \n",
|
366 |
+
"102596 69050334417 unconfirmed Megan Brooklyn \n",
|
367 |
+
"102597 11160591270 unconfirmed Christopher Queens \n",
|
368 |
+
"102598 68170633372 unconfirmed Rebecca Manhattan \n",
|
369 |
+
"\n",
|
370 |
+
" neighbourhood lat long country ... \n",
|
371 |
+
"0 Kensington 40.64749 -73.97237 United States ... \\\n",
|
372 |
+
"1 Midtown 40.75362 -73.98377 United States ... \n",
|
373 |
+
"2 Harlem 40.80902 -73.94190 United States ... \n",
|
374 |
+
"3 Clinton Hill 40.68514 -73.95976 United States ... \n",
|
375 |
+
"4 East Harlem 40.79851 -73.94399 United States ... \n",
|
376 |
+
"... ... ... ... ... ... \n",
|
377 |
+
"102594 Williamsburg 40.70862 -73.94651 United States ... \n",
|
378 |
+
"102595 Morningside Heights 40.80460 -73.96545 United States ... \n",
|
379 |
+
"102596 Park Slope 40.67505 -73.98045 United States ... \n",
|
380 |
+
"102597 Long Island City 40.74989 -73.93777 United States ... \n",
|
381 |
+
"102598 Upper West Side 40.76807 -73.98342 United States ... \n",
|
382 |
+
"\n",
|
383 |
+
" service fee minimum nights number of reviews last review \n",
|
384 |
+
"0 $193 10.0 9.0 10/19/2021 \\\n",
|
385 |
+
"1 $28 30.0 45.0 5/21/2022 \n",
|
386 |
+
"2 $124 3.0 0.0 NaN \n",
|
387 |
+
"3 $74 30.0 270.0 7/5/2019 \n",
|
388 |
+
"4 $41 10.0 9.0 11/19/2018 \n",
|
389 |
+
"... ... ... ... ... \n",
|
390 |
+
"102594 $169 1.0 0.0 NaN \n",
|
391 |
+
"102595 $167 1.0 1.0 7/6/2015 \n",
|
392 |
+
"102596 $198 3.0 0.0 NaN \n",
|
393 |
+
"102597 $109 2.0 5.0 10/11/2015 \n",
|
394 |
+
"102598 $206 1.0 0.0 NaN \n",
|
395 |
+
"\n",
|
396 |
+
" reviews per month review rate number calculated host listings count \n",
|
397 |
+
"0 0.21 4.0 6.0 \\\n",
|
398 |
+
"1 0.38 4.0 2.0 \n",
|
399 |
+
"2 NaN 5.0 1.0 \n",
|
400 |
+
"3 4.64 4.0 1.0 \n",
|
401 |
+
"4 0.10 3.0 1.0 \n",
|
402 |
+
"... ... ... ... \n",
|
403 |
+
"102594 NaN 3.0 1.0 \n",
|
404 |
+
"102595 0.02 2.0 2.0 \n",
|
405 |
+
"102596 NaN 5.0 1.0 \n",
|
406 |
+
"102597 0.10 3.0 1.0 \n",
|
407 |
+
"102598 NaN 3.0 1.0 \n",
|
408 |
+
"\n",
|
409 |
+
" availability 365 house_rules \n",
|
410 |
+
"0 286.0 Clean up and treat the home the way you'd like... \\\n",
|
411 |
+
"1 228.0 Pet friendly but please confirm with me if the... \n",
|
412 |
+
"2 352.0 I encourage you to use my kitchen, cooking and... \n",
|
413 |
+
"3 322.0 NaN \n",
|
414 |
+
"4 289.0 Please no smoking in the house, porch or on th... \n",
|
415 |
+
"... ... ... \n",
|
416 |
+
"102594 227.0 No Smoking No Parties or Events of any kind Pl... \n",
|
417 |
+
"102595 395.0 House rules: Guests agree to the following ter... \n",
|
418 |
+
"102596 342.0 NaN \n",
|
419 |
+
"102597 386.0 NaN \n",
|
420 |
+
"102598 69.0 NaN \n",
|
421 |
+
"\n",
|
422 |
+
" license \n",
|
423 |
+
"0 NaN \n",
|
424 |
+
"1 NaN \n",
|
425 |
+
"2 NaN \n",
|
426 |
+
"3 NaN \n",
|
427 |
+
"4 NaN \n",
|
428 |
+
"... ... \n",
|
429 |
+
"102594 NaN \n",
|
430 |
+
"102595 NaN \n",
|
431 |
+
"102596 NaN \n",
|
432 |
+
"102597 NaN \n",
|
433 |
+
"102598 NaN \n",
|
434 |
+
"\n",
|
435 |
+
"[102599 rows x 26 columns]"
|
436 |
+
]
|
437 |
+
},
|
438 |
+
"execution_count": 2,
|
439 |
+
"metadata": {},
|
440 |
+
"output_type": "execute_result"
|
441 |
+
}
|
442 |
+
],
|
443 |
+
"source": [
|
444 |
+
"data"
|
445 |
+
]
|
446 |
+
},
|
447 |
+
{
|
448 |
+
"cell_type": "code",
|
449 |
+
"execution_count": 3,
|
450 |
+
"id": "e21af5d1",
|
451 |
+
"metadata": {},
|
452 |
+
"outputs": [],
|
453 |
+
"source": [
|
454 |
+
"flight = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/flights/clean_Flights_2022.csv')"
|
455 |
+
]
|
456 |
+
},
|
457 |
+
{
|
458 |
+
"cell_type": "code",
|
459 |
+
"execution_count": 4,
|
460 |
+
"id": "966feef9",
|
461 |
+
"metadata": {},
|
462 |
+
"outputs": [],
|
463 |
+
"source": [
|
464 |
+
"flight = flight.to_dict(orient = 'split')"
|
465 |
+
]
|
466 |
+
},
|
467 |
+
{
|
468 |
+
"cell_type": "code",
|
469 |
+
"execution_count": 5,
|
470 |
+
"id": "3f4fe062",
|
471 |
+
"metadata": {},
|
472 |
+
"outputs": [],
|
473 |
+
"source": [
|
474 |
+
"data_dict = data.to_dict(orient = 'split')"
|
475 |
+
]
|
476 |
+
},
|
477 |
+
{
|
478 |
+
"cell_type": "code",
|
479 |
+
"execution_count": 6,
|
480 |
+
"id": "33213ac0",
|
481 |
+
"metadata": {},
|
482 |
+
"outputs": [
|
483 |
+
{
|
484 |
+
"data": {
|
485 |
+
"text/plain": [
|
486 |
+
"[2, '2022-04-04', '15:14', '16:36', 251.0, 'Durango', 'Denver', 100]"
|
487 |
+
]
|
488 |
+
},
|
489 |
+
"execution_count": 6,
|
490 |
+
"metadata": {},
|
491 |
+
"output_type": "execute_result"
|
492 |
+
}
|
493 |
+
],
|
494 |
+
"source": [
|
495 |
+
"flight['data'][2]"
|
496 |
+
]
|
497 |
+
},
|
498 |
+
{
|
499 |
+
"cell_type": "code",
|
500 |
+
"execution_count": 8,
|
501 |
+
"id": "9cef6161",
|
502 |
+
"metadata": {},
|
503 |
+
"outputs": [
|
504 |
+
{
|
505 |
+
"name": "stdout",
|
506 |
+
"output_type": "stream",
|
507 |
+
"text": [
|
508 |
+
"nan\n"
|
509 |
+
]
|
510 |
+
}
|
511 |
+
],
|
512 |
+
"source": [
|
513 |
+
"print(str(data_dict['data'][3][24]))"
|
514 |
+
]
|
515 |
+
},
|
516 |
+
{
|
517 |
+
"cell_type": "code",
|
518 |
+
"execution_count": 9,
|
519 |
+
"id": "c5f81f43",
|
520 |
+
"metadata": {},
|
521 |
+
"outputs": [],
|
522 |
+
"source": [
|
523 |
+
"city_set = set()\n",
|
524 |
+
"cnt = 0\n",
|
525 |
+
"for unit in data_dict['data']:\n",
|
526 |
+
" if str(unit[24]) != 'nan':\n",
|
527 |
+
" cnt += 1"
|
528 |
+
]
|
529 |
+
},
|
530 |
+
{
|
531 |
+
"cell_type": "code",
|
532 |
+
"execution_count": 10,
|
533 |
+
"id": "533a5aa6",
|
534 |
+
"metadata": {},
|
535 |
+
"outputs": [
|
536 |
+
{
|
537 |
+
"data": {
|
538 |
+
"text/plain": [
|
539 |
+
"50468"
|
540 |
+
]
|
541 |
+
},
|
542 |
+
"execution_count": 10,
|
543 |
+
"metadata": {},
|
544 |
+
"output_type": "execute_result"
|
545 |
+
}
|
546 |
+
],
|
547 |
+
"source": [
|
548 |
+
"cnt"
|
549 |
+
]
|
550 |
+
},
|
551 |
+
{
|
552 |
+
"cell_type": "code",
|
553 |
+
"execution_count": 11,
|
554 |
+
"id": "bfce5f56",
|
555 |
+
"metadata": {},
|
556 |
+
"outputs": [
|
557 |
+
{
|
558 |
+
"data": {
|
559 |
+
"text/plain": [
|
560 |
+
"set()"
|
561 |
+
]
|
562 |
+
},
|
563 |
+
"execution_count": 11,
|
564 |
+
"metadata": {},
|
565 |
+
"output_type": "execute_result"
|
566 |
+
}
|
567 |
+
],
|
568 |
+
"source": [
|
569 |
+
"city_set"
|
570 |
+
]
|
571 |
+
},
|
572 |
+
{
|
573 |
+
"cell_type": "code",
|
574 |
+
"execution_count": 12,
|
575 |
+
"id": "230b760c",
|
576 |
+
"metadata": {},
|
577 |
+
"outputs": [
|
578 |
+
{
|
579 |
+
"ename": "ValueError",
|
580 |
+
"evalue": "Sample larger than population or is negative",
|
581 |
+
"output_type": "error",
|
582 |
+
"traceback": [
|
583 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
584 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
585 |
+
"Cell \u001b[0;32mIn[12], line 3\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mrandom\u001b[39;00m\n\u001b[1;32m 2\u001b[0m city_set \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(city_set)\n\u001b[0;32m----> 3\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcity_set\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m)\n",
|
586 |
+
"File \u001b[0;32m~/miniconda3/envs/py39/lib/python3.9/random.py:449\u001b[0m, in \u001b[0;36mRandom.sample\u001b[0;34m(self, population, k, counts)\u001b[0m\n\u001b[1;32m 447\u001b[0m randbelow \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_randbelow\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m k \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m n:\n\u001b[0;32m--> 449\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSample larger than population or is negative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 450\u001b[0m result \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m k\n\u001b[1;32m 451\u001b[0m setsize \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m21\u001b[39m \u001b[38;5;66;03m# size of a small set minus size of an empty list\u001b[39;00m\n",
|
587 |
+
"\u001b[0;31mValueError\u001b[0m: Sample larger than population or is negative"
|
588 |
+
]
|
589 |
+
}
|
590 |
+
],
|
591 |
+
"source": [
|
592 |
+
"import random\n",
|
593 |
+
"city_set = list(city_set)\n",
|
594 |
+
"print(random.sample(city_set,1))"
|
595 |
+
]
|
596 |
+
},
|
597 |
+
{
|
598 |
+
"cell_type": "code",
|
599 |
+
"execution_count": 12,
|
600 |
+
"id": "61eddd5f",
|
601 |
+
"metadata": {},
|
602 |
+
"outputs": [
|
603 |
+
{
|
604 |
+
"ename": "AttributeError",
|
605 |
+
"evalue": "'dict' object has no attribute 'to_dict'",
|
606 |
+
"output_type": "error",
|
607 |
+
"traceback": [
|
608 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
609 |
+
"\u001b[0;31mAttributeError\u001b[0m Traceback (most recent call last)",
|
610 |
+
"Cell \u001b[0;32mIn[12], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m data_dict \u001b[38;5;241m=\u001b[39m \u001b[43mdata\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mto_dict\u001b[49m(orient \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m'\u001b[39m\u001b[38;5;124msplit\u001b[39m\u001b[38;5;124m'\u001b[39m)\n",
|
611 |
+
"\u001b[0;31mAttributeError\u001b[0m: 'dict' object has no attribute 'to_dict'"
|
612 |
+
]
|
613 |
+
}
|
614 |
+
],
|
615 |
+
"source": [
|
616 |
+
"data_dict = data.to_dict(orient = 'split')"
|
617 |
+
]
|
618 |
+
},
|
619 |
+
{
|
620 |
+
"cell_type": "code",
|
621 |
+
"execution_count": 35,
|
622 |
+
"id": "3292c450",
|
623 |
+
"metadata": {},
|
624 |
+
"outputs": [
|
625 |
+
{
|
626 |
+
"data": {
|
627 |
+
"text/plain": [
|
628 |
+
"['Unnamed: 0',\n",
|
629 |
+
" 'NAME',\n",
|
630 |
+
" 'room type',\n",
|
631 |
+
" 'price',\n",
|
632 |
+
" 'minimum nights',\n",
|
633 |
+
" 'review rate number',\n",
|
634 |
+
" 'house_rules',\n",
|
635 |
+
" 'maximum occupancy',\n",
|
636 |
+
" 'city']"
|
637 |
+
]
|
638 |
+
},
|
639 |
+
"execution_count": 35,
|
640 |
+
"metadata": {},
|
641 |
+
"output_type": "execute_result"
|
642 |
+
}
|
643 |
+
],
|
644 |
+
"source": [
|
645 |
+
"data_dict['columns']"
|
646 |
+
]
|
647 |
+
},
|
648 |
+
{
|
649 |
+
"cell_type": "code",
|
650 |
+
"execution_count": 38,
|
651 |
+
"id": "cfaa21d9",
|
652 |
+
"metadata": {},
|
653 |
+
"outputs": [
|
654 |
+
{
|
655 |
+
"data": {
|
656 |
+
"text/plain": [
|
657 |
+
"5047"
|
658 |
+
]
|
659 |
+
},
|
660 |
+
"execution_count": 38,
|
661 |
+
"metadata": {},
|
662 |
+
"output_type": "execute_result"
|
663 |
+
}
|
664 |
+
],
|
665 |
+
"source": [
|
666 |
+
"len(data_dict['data'])"
|
667 |
+
]
|
668 |
+
},
|
669 |
+
{
|
670 |
+
"cell_type": "code",
|
671 |
+
"execution_count": 36,
|
672 |
+
"id": "2980362d",
|
673 |
+
"metadata": {},
|
674 |
+
"outputs": [],
|
675 |
+
"source": [
|
676 |
+
"type_set = set()\n",
|
677 |
+
"for unit in data_dict['data']:\n",
|
678 |
+
" type_set.add(unit[2])"
|
679 |
+
]
|
680 |
+
},
|
681 |
+
{
|
682 |
+
"cell_type": "code",
|
683 |
+
"execution_count": 37,
|
684 |
+
"id": "f5e36fbb",
|
685 |
+
"metadata": {},
|
686 |
+
"outputs": [
|
687 |
+
{
|
688 |
+
"data": {
|
689 |
+
"text/plain": [
|
690 |
+
"{'Entire home/apt', 'Private room', 'Shared room'}"
|
691 |
+
]
|
692 |
+
},
|
693 |
+
"execution_count": 37,
|
694 |
+
"metadata": {},
|
695 |
+
"output_type": "execute_result"
|
696 |
+
}
|
697 |
+
],
|
698 |
+
"source": [
|
699 |
+
"type_set"
|
700 |
+
]
|
701 |
+
},
|
702 |
+
{
|
703 |
+
"cell_type": "code",
|
704 |
+
"execution_count": 15,
|
705 |
+
"id": "bf1231c4",
|
706 |
+
"metadata": {},
|
707 |
+
"outputs": [
|
708 |
+
{
|
709 |
+
"ename": "NameError",
|
710 |
+
"evalue": "name 'data_dict' is not defined",
|
711 |
+
"output_type": "error",
|
712 |
+
"traceback": [
|
713 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
714 |
+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
715 |
+
"Cell \u001b[0;32mIn[15], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mdata_dict\u001b[49m[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mdata\u001b[39m\u001b[38;5;124m'\u001b[39m][\u001b[38;5;241m147\u001b[39m]\n",
|
716 |
+
"\u001b[0;31mNameError\u001b[0m: name 'data_dict' is not defined"
|
717 |
+
]
|
718 |
+
}
|
719 |
+
],
|
720 |
+
"source": [
|
721 |
+
"data_dict['data'][147]"
|
722 |
+
]
|
723 |
+
},
|
724 |
+
{
|
725 |
+
"cell_type": "code",
|
726 |
+
"execution_count": 14,
|
727 |
+
"id": "f993b894",
|
728 |
+
"metadata": {},
|
729 |
+
"outputs": [
|
730 |
+
{
|
731 |
+
"data": {
|
732 |
+
"text/plain": [
|
733 |
+
"set()"
|
734 |
+
]
|
735 |
+
},
|
736 |
+
"execution_count": 14,
|
737 |
+
"metadata": {},
|
738 |
+
"output_type": "execute_result"
|
739 |
+
}
|
740 |
+
],
|
741 |
+
"source": [
|
742 |
+
"type_set"
|
743 |
+
]
|
744 |
+
},
|
745 |
+
{
|
746 |
+
"cell_type": "code",
|
747 |
+
"execution_count": 10,
|
748 |
+
"id": "916e9470",
|
749 |
+
"metadata": {},
|
750 |
+
"outputs": [
|
751 |
+
{
|
752 |
+
"name": "stdout",
|
753 |
+
"output_type": "stream",
|
754 |
+
"text": [
|
755 |
+
"1 NAME\n",
|
756 |
+
"7 lat\n",
|
757 |
+
"8 long\n",
|
758 |
+
"13 room type\n",
|
759 |
+
"15 price\n",
|
760 |
+
"17 minimum nights\n",
|
761 |
+
"21 review rate number\n",
|
762 |
+
"24 house_rules\n"
|
763 |
+
]
|
764 |
+
}
|
765 |
+
],
|
766 |
+
"source": [
|
767 |
+
"for idx, unit in enumerate(data_dict['columns']):\n",
|
768 |
+
" if unit in ['NAME','lat', 'long', 'room type', 'price','minimum nights','review rate number','house_rules']:\n",
|
769 |
+
" print(idx,unit)"
|
770 |
+
]
|
771 |
+
},
|
772 |
+
{
|
773 |
+
"cell_type": "code",
|
774 |
+
"execution_count": 73,
|
775 |
+
"id": "1213484d",
|
776 |
+
"metadata": {},
|
777 |
+
"outputs": [
|
778 |
+
{
|
779 |
+
"data": {
|
780 |
+
"application/vnd.jupyter.widget-view+json": {
|
781 |
+
"model_id": "51764c1a3739416289913ec613816cc7",
|
782 |
+
"version_major": 2,
|
783 |
+
"version_minor": 0
|
784 |
+
},
|
785 |
+
"text/plain": [
|
786 |
+
"0it [00:00, ?it/s]"
|
787 |
+
]
|
788 |
+
},
|
789 |
+
"metadata": {},
|
790 |
+
"output_type": "display_data"
|
791 |
+
},
|
792 |
+
{
|
793 |
+
"name": "stderr",
|
794 |
+
"output_type": "stream",
|
795 |
+
"text": [
|
796 |
+
"/tmp/ipykernel_3241846/557604333.py:23: DeprecationWarning: Sampling from a set deprecated\n",
|
797 |
+
"since Python 3.9 and will be removed in a subsequent version.\n",
|
798 |
+
" tmp_dict[\"city\"] = random.sample(city_set,1)[0]\n"
|
799 |
+
]
|
800 |
+
},
|
801 |
+
{
|
802 |
+
"ename": "ValueError",
|
803 |
+
"evalue": "Sample larger than population or is negative",
|
804 |
+
"output_type": "error",
|
805 |
+
"traceback": [
|
806 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
807 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
808 |
+
"Cell \u001b[0;32mIn[73], line 23\u001b[0m\n\u001b[1;32m 21\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mreview rate number\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m unit[\u001b[38;5;241m21\u001b[39m]\n\u001b[1;32m 22\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mhouse_rules\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m unit[\u001b[38;5;241m24\u001b[39m]\n\u001b[0;32m---> 23\u001b[0m tmp_dict[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcity\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43mrandom\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msample\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcity_set\u001b[49m\u001b[43m,\u001b[49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m)\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 24\u001b[0m new_data\u001b[38;5;241m.\u001b[39mappend(tmp_dict)\n",
|
809 |
+
"File \u001b[0;32m~/miniconda3/envs/py39/lib/python3.9/random.py:449\u001b[0m, in \u001b[0;36mRandom.sample\u001b[0;34m(self, population, k, counts)\u001b[0m\n\u001b[1;32m 447\u001b[0m randbelow \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_randbelow\n\u001b[1;32m 448\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;241m0\u001b[39m \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m k \u001b[38;5;241m<\u001b[39m\u001b[38;5;241m=\u001b[39m n:\n\u001b[0;32m--> 449\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mValueError\u001b[39;00m(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mSample larger than population or is negative\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 450\u001b[0m result \u001b[38;5;241m=\u001b[39m [\u001b[38;5;28;01mNone\u001b[39;00m] \u001b[38;5;241m*\u001b[39m k\n\u001b[1;32m 451\u001b[0m setsize \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m21\u001b[39m \u001b[38;5;66;03m# size of a small set minus size of an empty list\u001b[39;00m\n",
|
810 |
+
"\u001b[0;31mValueError\u001b[0m: Sample larger than population or is negative"
|
811 |
+
]
|
812 |
+
}
|
813 |
+
],
|
814 |
+
"source": [
|
815 |
+
"from tqdm.autonotebook import tqdm\n",
|
816 |
+
"import random\n",
|
817 |
+
"new_data = []\n",
|
818 |
+
"for idx, unit in tqdm(enumerate(data_dict['data'])):\n",
|
819 |
+
" tmp_dict = {k:\"\" for k in ['NAME','room type', 'price','minimum nights','review rate number','house_rules']}\n",
|
820 |
+
" tmp_dict[\"NAME\"] = unit[1]\n",
|
821 |
+
" tmp_dict[\"room type\"] = unit[13]\n",
|
822 |
+
" if unit[13] == \"Shared room\":\n",
|
823 |
+
" tmp_dict[\"maximum occupancy\"] = 1\n",
|
824 |
+
" elif unit[13] == \"Hotel room\":\n",
|
825 |
+
" tmp_dict[\"maximum occupancy\"] = random.randint(1, 2)\n",
|
826 |
+
" elif unit[13] == \"Private room\":\n",
|
827 |
+
" tmp_dict[\"maximum occupancy\"] = random.randint(1, 2)\n",
|
828 |
+
" elif unit[13] == \"Entire home/apt\":\n",
|
829 |
+
" try:\n",
|
830 |
+
" tmp_dict[\"maximum occupancy\"] = random.randint(2, max(3,eval(unit[15].replace(\"$\",\"\").replace(\",\",\"\"))//100))\n",
|
831 |
+
" except:\n",
|
832 |
+
" tmp_dict[\"maximum occupancy\"] = random.randint(2, max(3,unit[15]//100))\n",
|
833 |
+
" tmp_dict[\"price\"] = unit[15].replace(\"$\",\"\").replace(\",\",\"\")\n",
|
834 |
+
" tmp_dict[\"minimum nights\"] = unit[17]\n",
|
835 |
+
" tmp_dict[\"review rate number\"] = unit[21]\n",
|
836 |
+
" tmp_dict[\"house_rules\"] = unit[24]\n",
|
837 |
+
" tmp_dict[\"city\"] = random.sample(city_set,1)[0]\n",
|
838 |
+
" new_data.append(tmp_dict)"
|
839 |
+
]
|
840 |
+
},
|
841 |
+
{
|
842 |
+
"cell_type": "code",
|
843 |
+
"execution_count": 20,
|
844 |
+
"id": "fd3e8257",
|
845 |
+
"metadata": {},
|
846 |
+
"outputs": [
|
847 |
+
{
|
848 |
+
"data": {
|
849 |
+
"text/plain": [
|
850 |
+
"102599"
|
851 |
+
]
|
852 |
+
},
|
853 |
+
"execution_count": 20,
|
854 |
+
"metadata": {},
|
855 |
+
"output_type": "execute_result"
|
856 |
+
}
|
857 |
+
],
|
858 |
+
"source": [
|
859 |
+
"len(new_data)"
|
860 |
+
]
|
861 |
+
},
|
862 |
+
{
|
863 |
+
"cell_type": "code",
|
864 |
+
"execution_count": 21,
|
865 |
+
"id": "bfb243c0",
|
866 |
+
"metadata": {},
|
867 |
+
"outputs": [],
|
868 |
+
"source": [
|
869 |
+
"df = pd.DataFrame(new_data)"
|
870 |
+
]
|
871 |
+
},
|
872 |
+
{
|
873 |
+
"cell_type": "code",
|
874 |
+
"execution_count": 23,
|
875 |
+
"id": "af7e3411",
|
876 |
+
"metadata": {},
|
877 |
+
"outputs": [],
|
878 |
+
"source": [
|
879 |
+
"df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
880 |
+
]
|
881 |
+
},
|
882 |
+
{
|
883 |
+
"cell_type": "code",
|
884 |
+
"execution_count": 22,
|
885 |
+
"id": "71d21fea",
|
886 |
+
"metadata": {},
|
887 |
+
"outputs": [
|
888 |
+
{
|
889 |
+
"data": {
|
890 |
+
"text/html": [
|
891 |
+
"<div>\n",
|
892 |
+
"<style scoped>\n",
|
893 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
894 |
+
" vertical-align: middle;\n",
|
895 |
+
" }\n",
|
896 |
+
"\n",
|
897 |
+
" .dataframe tbody tr th {\n",
|
898 |
+
" vertical-align: top;\n",
|
899 |
+
" }\n",
|
900 |
+
"\n",
|
901 |
+
" .dataframe thead th {\n",
|
902 |
+
" text-align: right;\n",
|
903 |
+
" }\n",
|
904 |
+
"</style>\n",
|
905 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
906 |
+
" <thead>\n",
|
907 |
+
" <tr style=\"text-align: right;\">\n",
|
908 |
+
" <th></th>\n",
|
909 |
+
" <th>NAME</th>\n",
|
910 |
+
" <th>room type</th>\n",
|
911 |
+
" <th>price</th>\n",
|
912 |
+
" <th>minimum nights</th>\n",
|
913 |
+
" <th>review rate number</th>\n",
|
914 |
+
" <th>house_rules</th>\n",
|
915 |
+
" <th>maximum occupancy</th>\n",
|
916 |
+
" <th>city</th>\n",
|
917 |
+
" </tr>\n",
|
918 |
+
" </thead>\n",
|
919 |
+
" <tbody>\n",
|
920 |
+
" <tr>\n",
|
921 |
+
" <th>0</th>\n",
|
922 |
+
" <td>Clean & quiet apt home by the park</td>\n",
|
923 |
+
" <td>Private room</td>\n",
|
924 |
+
" <td>$966</td>\n",
|
925 |
+
" <td>10.0</td>\n",
|
926 |
+
" <td>4.0</td>\n",
|
927 |
+
" <td>Clean up and treat the home the way you'd like...</td>\n",
|
928 |
+
" <td>1</td>\n",
|
929 |
+
" <td>Des Moines</td>\n",
|
930 |
+
" </tr>\n",
|
931 |
+
" <tr>\n",
|
932 |
+
" <th>1</th>\n",
|
933 |
+
" <td>Skylit Midtown Castle</td>\n",
|
934 |
+
" <td>Entire home/apt</td>\n",
|
935 |
+
" <td>$142</td>\n",
|
936 |
+
" <td>30.0</td>\n",
|
937 |
+
" <td>4.0</td>\n",
|
938 |
+
" <td>Pet friendly but please confirm with me if the...</td>\n",
|
939 |
+
" <td>2</td>\n",
|
940 |
+
" <td>Wilmington</td>\n",
|
941 |
+
" </tr>\n",
|
942 |
+
" <tr>\n",
|
943 |
+
" <th>2</th>\n",
|
944 |
+
" <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
|
945 |
+
" <td>Private room</td>\n",
|
946 |
+
" <td>$620</td>\n",
|
947 |
+
" <td>3.0</td>\n",
|
948 |
+
" <td>5.0</td>\n",
|
949 |
+
" <td>I encourage you to use my kitchen, cooking and...</td>\n",
|
950 |
+
" <td>2</td>\n",
|
951 |
+
" <td>St. George</td>\n",
|
952 |
+
" </tr>\n",
|
953 |
+
" <tr>\n",
|
954 |
+
" <th>3</th>\n",
|
955 |
+
" <td>NaN</td>\n",
|
956 |
+
" <td>Entire home/apt</td>\n",
|
957 |
+
" <td>$368</td>\n",
|
958 |
+
" <td>30.0</td>\n",
|
959 |
+
" <td>4.0</td>\n",
|
960 |
+
" <td>NaN</td>\n",
|
961 |
+
" <td>2</td>\n",
|
962 |
+
" <td>Kalamazoo</td>\n",
|
963 |
+
" </tr>\n",
|
964 |
+
" <tr>\n",
|
965 |
+
" <th>4</th>\n",
|
966 |
+
" <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
|
967 |
+
" <td>Entire home/apt</td>\n",
|
968 |
+
" <td>$204</td>\n",
|
969 |
+
" <td>10.0</td>\n",
|
970 |
+
" <td>3.0</td>\n",
|
971 |
+
" <td>Please no smoking in the house, porch or on th...</td>\n",
|
972 |
+
" <td>3</td>\n",
|
973 |
+
" <td>Cheyenne</td>\n",
|
974 |
+
" </tr>\n",
|
975 |
+
" <tr>\n",
|
976 |
+
" <th>...</th>\n",
|
977 |
+
" <td>...</td>\n",
|
978 |
+
" <td>...</td>\n",
|
979 |
+
" <td>...</td>\n",
|
980 |
+
" <td>...</td>\n",
|
981 |
+
" <td>...</td>\n",
|
982 |
+
" <td>...</td>\n",
|
983 |
+
" <td>...</td>\n",
|
984 |
+
" <td>...</td>\n",
|
985 |
+
" </tr>\n",
|
986 |
+
" <tr>\n",
|
987 |
+
" <th>102594</th>\n",
|
988 |
+
" <td>Spare room in Williamsburg</td>\n",
|
989 |
+
" <td>Private room</td>\n",
|
990 |
+
" <td>$844</td>\n",
|
991 |
+
" <td>1.0</td>\n",
|
992 |
+
" <td>3.0</td>\n",
|
993 |
+
" <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
|
994 |
+
" <td>1</td>\n",
|
995 |
+
" <td>White Plains</td>\n",
|
996 |
+
" </tr>\n",
|
997 |
+
" <tr>\n",
|
998 |
+
" <th>102595</th>\n",
|
999 |
+
" <td>Best Location near Columbia U</td>\n",
|
1000 |
+
" <td>Private room</td>\n",
|
1001 |
+
" <td>$837</td>\n",
|
1002 |
+
" <td>1.0</td>\n",
|
1003 |
+
" <td>2.0</td>\n",
|
1004 |
+
" <td>House rules: Guests agree to the following ter...</td>\n",
|
1005 |
+
" <td>2</td>\n",
|
1006 |
+
" <td>Mosinee</td>\n",
|
1007 |
+
" </tr>\n",
|
1008 |
+
" <tr>\n",
|
1009 |
+
" <th>102596</th>\n",
|
1010 |
+
" <td>Comfy, bright room in Brooklyn</td>\n",
|
1011 |
+
" <td>Private room</td>\n",
|
1012 |
+
" <td>$988</td>\n",
|
1013 |
+
" <td>3.0</td>\n",
|
1014 |
+
" <td>5.0</td>\n",
|
1015 |
+
" <td>NaN</td>\n",
|
1016 |
+
" <td>2</td>\n",
|
1017 |
+
" <td>Amarillo</td>\n",
|
1018 |
+
" </tr>\n",
|
1019 |
+
" <tr>\n",
|
1020 |
+
" <th>102597</th>\n",
|
1021 |
+
" <td>Big Studio-One Stop from Midtown</td>\n",
|
1022 |
+
" <td>Entire home/apt</td>\n",
|
1023 |
+
" <td>$546</td>\n",
|
1024 |
+
" <td>2.0</td>\n",
|
1025 |
+
" <td>3.0</td>\n",
|
1026 |
+
" <td>NaN</td>\n",
|
1027 |
+
" <td>4</td>\n",
|
1028 |
+
" <td>Binghamton</td>\n",
|
1029 |
+
" </tr>\n",
|
1030 |
+
" <tr>\n",
|
1031 |
+
" <th>102598</th>\n",
|
1032 |
+
" <td>585 sf Luxury Studio</td>\n",
|
1033 |
+
" <td>Entire home/apt</td>\n",
|
1034 |
+
" <td>$1,032</td>\n",
|
1035 |
+
" <td>1.0</td>\n",
|
1036 |
+
" <td>3.0</td>\n",
|
1037 |
+
" <td>NaN</td>\n",
|
1038 |
+
" <td>7</td>\n",
|
1039 |
+
" <td>Flint</td>\n",
|
1040 |
+
" </tr>\n",
|
1041 |
+
" </tbody>\n",
|
1042 |
+
"</table>\n",
|
1043 |
+
"<p>102599 rows × 8 columns</p>\n",
|
1044 |
+
"</div>"
|
1045 |
+
],
|
1046 |
+
"text/plain": [
|
1047 |
+
" NAME room type \n",
|
1048 |
+
"0 Clean & quiet apt home by the park Private room \\\n",
|
1049 |
+
"1 Skylit Midtown Castle Entire home/apt \n",
|
1050 |
+
"2 THE VILLAGE OF HARLEM....NEW YORK ! Private room \n",
|
1051 |
+
"3 NaN Entire home/apt \n",
|
1052 |
+
"4 Entire Apt: Spacious Studio/Loft by central park Entire home/apt \n",
|
1053 |
+
"... ... ... \n",
|
1054 |
+
"102594 Spare room in Williamsburg Private room \n",
|
1055 |
+
"102595 Best Location near Columbia U Private room \n",
|
1056 |
+
"102596 Comfy, bright room in Brooklyn Private room \n",
|
1057 |
+
"102597 Big Studio-One Stop from Midtown Entire home/apt \n",
|
1058 |
+
"102598 585 sf Luxury Studio Entire home/apt \n",
|
1059 |
+
"\n",
|
1060 |
+
" price minimum nights review rate number \n",
|
1061 |
+
"0 $966 10.0 4.0 \\\n",
|
1062 |
+
"1 $142 30.0 4.0 \n",
|
1063 |
+
"2 $620 3.0 5.0 \n",
|
1064 |
+
"3 $368 30.0 4.0 \n",
|
1065 |
+
"4 $204 10.0 3.0 \n",
|
1066 |
+
"... ... ... ... \n",
|
1067 |
+
"102594 $844 1.0 3.0 \n",
|
1068 |
+
"102595 $837 1.0 2.0 \n",
|
1069 |
+
"102596 $988 3.0 5.0 \n",
|
1070 |
+
"102597 $546 2.0 3.0 \n",
|
1071 |
+
"102598 $1,032 1.0 3.0 \n",
|
1072 |
+
"\n",
|
1073 |
+
" house_rules maximum occupancy \n",
|
1074 |
+
"0 Clean up and treat the home the way you'd like... 1 \\\n",
|
1075 |
+
"1 Pet friendly but please confirm with me if the... 2 \n",
|
1076 |
+
"2 I encourage you to use my kitchen, cooking and... 2 \n",
|
1077 |
+
"3 NaN 2 \n",
|
1078 |
+
"4 Please no smoking in the house, porch or on th... 3 \n",
|
1079 |
+
"... ... ... \n",
|
1080 |
+
"102594 No Smoking No Parties or Events of any kind Pl... 1 \n",
|
1081 |
+
"102595 House rules: Guests agree to the following ter... 2 \n",
|
1082 |
+
"102596 NaN 2 \n",
|
1083 |
+
"102597 NaN 4 \n",
|
1084 |
+
"102598 NaN 7 \n",
|
1085 |
+
"\n",
|
1086 |
+
" city \n",
|
1087 |
+
"0 Des Moines \n",
|
1088 |
+
"1 Wilmington \n",
|
1089 |
+
"2 St. George \n",
|
1090 |
+
"3 Kalamazoo \n",
|
1091 |
+
"4 Cheyenne \n",
|
1092 |
+
"... ... \n",
|
1093 |
+
"102594 White Plains \n",
|
1094 |
+
"102595 Mosinee \n",
|
1095 |
+
"102596 Amarillo \n",
|
1096 |
+
"102597 Binghamton \n",
|
1097 |
+
"102598 Flint \n",
|
1098 |
+
"\n",
|
1099 |
+
"[102599 rows x 8 columns]"
|
1100 |
+
]
|
1101 |
+
},
|
1102 |
+
"execution_count": 22,
|
1103 |
+
"metadata": {},
|
1104 |
+
"output_type": "execute_result"
|
1105 |
+
}
|
1106 |
+
],
|
1107 |
+
"source": [
|
1108 |
+
"df"
|
1109 |
+
]
|
1110 |
+
},
|
1111 |
+
{
|
1112 |
+
"cell_type": "code",
|
1113 |
+
"execution_count": 50,
|
1114 |
+
"id": "0ec56283",
|
1115 |
+
"metadata": {},
|
1116 |
+
"outputs": [],
|
1117 |
+
"source": [
|
1118 |
+
"import pandas as pd\n",
|
1119 |
+
"data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
1120 |
+
]
|
1121 |
+
},
|
1122 |
+
{
|
1123 |
+
"cell_type": "code",
|
1124 |
+
"execution_count": 52,
|
1125 |
+
"id": "5dc27048",
|
1126 |
+
"metadata": {},
|
1127 |
+
"outputs": [
|
1128 |
+
{
|
1129 |
+
"data": {
|
1130 |
+
"text/html": [
|
1131 |
+
"<div>\n",
|
1132 |
+
"<style scoped>\n",
|
1133 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1134 |
+
" vertical-align: middle;\n",
|
1135 |
+
" }\n",
|
1136 |
+
"\n",
|
1137 |
+
" .dataframe tbody tr th {\n",
|
1138 |
+
" vertical-align: top;\n",
|
1139 |
+
" }\n",
|
1140 |
+
"\n",
|
1141 |
+
" .dataframe thead th {\n",
|
1142 |
+
" text-align: right;\n",
|
1143 |
+
" }\n",
|
1144 |
+
"</style>\n",
|
1145 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1146 |
+
" <thead>\n",
|
1147 |
+
" <tr style=\"text-align: right;\">\n",
|
1148 |
+
" <th></th>\n",
|
1149 |
+
" <th>Unnamed: 0</th>\n",
|
1150 |
+
" <th>NAME</th>\n",
|
1151 |
+
" <th>room type</th>\n",
|
1152 |
+
" <th>price</th>\n",
|
1153 |
+
" <th>minimum nights</th>\n",
|
1154 |
+
" <th>review rate number</th>\n",
|
1155 |
+
" <th>house_rules</th>\n",
|
1156 |
+
" <th>maximum occupancy</th>\n",
|
1157 |
+
" <th>city</th>\n",
|
1158 |
+
" </tr>\n",
|
1159 |
+
" </thead>\n",
|
1160 |
+
" <tbody>\n",
|
1161 |
+
" <tr>\n",
|
1162 |
+
" <th>0</th>\n",
|
1163 |
+
" <td>0</td>\n",
|
1164 |
+
" <td>Clean & quiet apt home by the park</td>\n",
|
1165 |
+
" <td>Private room</td>\n",
|
1166 |
+
" <td>$966</td>\n",
|
1167 |
+
" <td>10.0</td>\n",
|
1168 |
+
" <td>4.0</td>\n",
|
1169 |
+
" <td>Clean up and treat the home the way you'd like...</td>\n",
|
1170 |
+
" <td>1</td>\n",
|
1171 |
+
" <td>Des Moines</td>\n",
|
1172 |
+
" </tr>\n",
|
1173 |
+
" <tr>\n",
|
1174 |
+
" <th>1</th>\n",
|
1175 |
+
" <td>1</td>\n",
|
1176 |
+
" <td>Skylit Midtown Castle</td>\n",
|
1177 |
+
" <td>Entire home/apt</td>\n",
|
1178 |
+
" <td>$142</td>\n",
|
1179 |
+
" <td>30.0</td>\n",
|
1180 |
+
" <td>4.0</td>\n",
|
1181 |
+
" <td>Pet friendly but please confirm with me if the...</td>\n",
|
1182 |
+
" <td>2</td>\n",
|
1183 |
+
" <td>Wilmington</td>\n",
|
1184 |
+
" </tr>\n",
|
1185 |
+
" <tr>\n",
|
1186 |
+
" <th>2</th>\n",
|
1187 |
+
" <td>2</td>\n",
|
1188 |
+
" <td>THE VILLAGE OF HARLEM....NEW YORK !</td>\n",
|
1189 |
+
" <td>Private room</td>\n",
|
1190 |
+
" <td>$620</td>\n",
|
1191 |
+
" <td>3.0</td>\n",
|
1192 |
+
" <td>5.0</td>\n",
|
1193 |
+
" <td>I encourage you to use my kitchen, cooking and...</td>\n",
|
1194 |
+
" <td>2</td>\n",
|
1195 |
+
" <td>St. George</td>\n",
|
1196 |
+
" </tr>\n",
|
1197 |
+
" <tr>\n",
|
1198 |
+
" <th>3</th>\n",
|
1199 |
+
" <td>3</td>\n",
|
1200 |
+
" <td>NaN</td>\n",
|
1201 |
+
" <td>Entire home/apt</td>\n",
|
1202 |
+
" <td>$368</td>\n",
|
1203 |
+
" <td>30.0</td>\n",
|
1204 |
+
" <td>4.0</td>\n",
|
1205 |
+
" <td>NaN</td>\n",
|
1206 |
+
" <td>2</td>\n",
|
1207 |
+
" <td>Kalamazoo</td>\n",
|
1208 |
+
" </tr>\n",
|
1209 |
+
" <tr>\n",
|
1210 |
+
" <th>4</th>\n",
|
1211 |
+
" <td>4</td>\n",
|
1212 |
+
" <td>Entire Apt: Spacious Studio/Loft by central park</td>\n",
|
1213 |
+
" <td>Entire home/apt</td>\n",
|
1214 |
+
" <td>$204</td>\n",
|
1215 |
+
" <td>10.0</td>\n",
|
1216 |
+
" <td>3.0</td>\n",
|
1217 |
+
" <td>Please no smoking in the house, porch or on th...</td>\n",
|
1218 |
+
" <td>3</td>\n",
|
1219 |
+
" <td>Cheyenne</td>\n",
|
1220 |
+
" </tr>\n",
|
1221 |
+
" <tr>\n",
|
1222 |
+
" <th>...</th>\n",
|
1223 |
+
" <td>...</td>\n",
|
1224 |
+
" <td>...</td>\n",
|
1225 |
+
" <td>...</td>\n",
|
1226 |
+
" <td>...</td>\n",
|
1227 |
+
" <td>...</td>\n",
|
1228 |
+
" <td>...</td>\n",
|
1229 |
+
" <td>...</td>\n",
|
1230 |
+
" <td>...</td>\n",
|
1231 |
+
" <td>...</td>\n",
|
1232 |
+
" </tr>\n",
|
1233 |
+
" <tr>\n",
|
1234 |
+
" <th>102594</th>\n",
|
1235 |
+
" <td>102594</td>\n",
|
1236 |
+
" <td>Spare room in Williamsburg</td>\n",
|
1237 |
+
" <td>Private room</td>\n",
|
1238 |
+
" <td>$844</td>\n",
|
1239 |
+
" <td>1.0</td>\n",
|
1240 |
+
" <td>3.0</td>\n",
|
1241 |
+
" <td>No Smoking No Parties or Events of any kind Pl...</td>\n",
|
1242 |
+
" <td>1</td>\n",
|
1243 |
+
" <td>White Plains</td>\n",
|
1244 |
+
" </tr>\n",
|
1245 |
+
" <tr>\n",
|
1246 |
+
" <th>102595</th>\n",
|
1247 |
+
" <td>102595</td>\n",
|
1248 |
+
" <td>Best Location near Columbia U</td>\n",
|
1249 |
+
" <td>Private room</td>\n",
|
1250 |
+
" <td>$837</td>\n",
|
1251 |
+
" <td>1.0</td>\n",
|
1252 |
+
" <td>2.0</td>\n",
|
1253 |
+
" <td>House rules: Guests agree to the following ter...</td>\n",
|
1254 |
+
" <td>2</td>\n",
|
1255 |
+
" <td>Mosinee</td>\n",
|
1256 |
+
" </tr>\n",
|
1257 |
+
" <tr>\n",
|
1258 |
+
" <th>102596</th>\n",
|
1259 |
+
" <td>102596</td>\n",
|
1260 |
+
" <td>Comfy, bright room in Brooklyn</td>\n",
|
1261 |
+
" <td>Private room</td>\n",
|
1262 |
+
" <td>$988</td>\n",
|
1263 |
+
" <td>3.0</td>\n",
|
1264 |
+
" <td>5.0</td>\n",
|
1265 |
+
" <td>NaN</td>\n",
|
1266 |
+
" <td>2</td>\n",
|
1267 |
+
" <td>Amarillo</td>\n",
|
1268 |
+
" </tr>\n",
|
1269 |
+
" <tr>\n",
|
1270 |
+
" <th>102597</th>\n",
|
1271 |
+
" <td>102597</td>\n",
|
1272 |
+
" <td>Big Studio-One Stop from Midtown</td>\n",
|
1273 |
+
" <td>Entire home/apt</td>\n",
|
1274 |
+
" <td>$546</td>\n",
|
1275 |
+
" <td>2.0</td>\n",
|
1276 |
+
" <td>3.0</td>\n",
|
1277 |
+
" <td>NaN</td>\n",
|
1278 |
+
" <td>4</td>\n",
|
1279 |
+
" <td>Binghamton</td>\n",
|
1280 |
+
" </tr>\n",
|
1281 |
+
" <tr>\n",
|
1282 |
+
" <th>102598</th>\n",
|
1283 |
+
" <td>102598</td>\n",
|
1284 |
+
" <td>585 sf Luxury Studio</td>\n",
|
1285 |
+
" <td>Entire home/apt</td>\n",
|
1286 |
+
" <td>$1,032</td>\n",
|
1287 |
+
" <td>1.0</td>\n",
|
1288 |
+
" <td>3.0</td>\n",
|
1289 |
+
" <td>NaN</td>\n",
|
1290 |
+
" <td>7</td>\n",
|
1291 |
+
" <td>Flint</td>\n",
|
1292 |
+
" </tr>\n",
|
1293 |
+
" </tbody>\n",
|
1294 |
+
"</table>\n",
|
1295 |
+
"<p>102599 rows × 9 columns</p>\n",
|
1296 |
+
"</div>"
|
1297 |
+
],
|
1298 |
+
"text/plain": [
|
1299 |
+
" Unnamed: 0 NAME \n",
|
1300 |
+
"0 0 Clean & quiet apt home by the park \\\n",
|
1301 |
+
"1 1 Skylit Midtown Castle \n",
|
1302 |
+
"2 2 THE VILLAGE OF HARLEM....NEW YORK ! \n",
|
1303 |
+
"3 3 NaN \n",
|
1304 |
+
"4 4 Entire Apt: Spacious Studio/Loft by central park \n",
|
1305 |
+
"... ... ... \n",
|
1306 |
+
"102594 102594 Spare room in Williamsburg \n",
|
1307 |
+
"102595 102595 Best Location near Columbia U \n",
|
1308 |
+
"102596 102596 Comfy, bright room in Brooklyn \n",
|
1309 |
+
"102597 102597 Big Studio-One Stop from Midtown \n",
|
1310 |
+
"102598 102598 585 sf Luxury Studio \n",
|
1311 |
+
"\n",
|
1312 |
+
" room type price minimum nights review rate number \n",
|
1313 |
+
"0 Private room $966 10.0 4.0 \\\n",
|
1314 |
+
"1 Entire home/apt $142 30.0 4.0 \n",
|
1315 |
+
"2 Private room $620 3.0 5.0 \n",
|
1316 |
+
"3 Entire home/apt $368 30.0 4.0 \n",
|
1317 |
+
"4 Entire home/apt $204 10.0 3.0 \n",
|
1318 |
+
"... ... ... ... ... \n",
|
1319 |
+
"102594 Private room $844 1.0 3.0 \n",
|
1320 |
+
"102595 Private room $837 1.0 2.0 \n",
|
1321 |
+
"102596 Private room $988 3.0 5.0 \n",
|
1322 |
+
"102597 Entire home/apt $546 2.0 3.0 \n",
|
1323 |
+
"102598 Entire home/apt $1,032 1.0 3.0 \n",
|
1324 |
+
"\n",
|
1325 |
+
" house_rules maximum occupancy \n",
|
1326 |
+
"0 Clean up and treat the home the way you'd like... 1 \\\n",
|
1327 |
+
"1 Pet friendly but please confirm with me if the... 2 \n",
|
1328 |
+
"2 I encourage you to use my kitchen, cooking and... 2 \n",
|
1329 |
+
"3 NaN 2 \n",
|
1330 |
+
"4 Please no smoking in the house, porch or on th... 3 \n",
|
1331 |
+
"... ... ... \n",
|
1332 |
+
"102594 No Smoking No Parties or Events of any kind Pl... 1 \n",
|
1333 |
+
"102595 House rules: Guests agree to the following ter... 2 \n",
|
1334 |
+
"102596 NaN 2 \n",
|
1335 |
+
"102597 NaN 4 \n",
|
1336 |
+
"102598 NaN 7 \n",
|
1337 |
+
"\n",
|
1338 |
+
" city \n",
|
1339 |
+
"0 Des Moines \n",
|
1340 |
+
"1 Wilmington \n",
|
1341 |
+
"2 St. George \n",
|
1342 |
+
"3 Kalamazoo \n",
|
1343 |
+
"4 Cheyenne \n",
|
1344 |
+
"... ... \n",
|
1345 |
+
"102594 White Plains \n",
|
1346 |
+
"102595 Mosinee \n",
|
1347 |
+
"102596 Amarillo \n",
|
1348 |
+
"102597 Binghamton \n",
|
1349 |
+
"102598 Flint \n",
|
1350 |
+
"\n",
|
1351 |
+
"[102599 rows x 9 columns]"
|
1352 |
+
]
|
1353 |
+
},
|
1354 |
+
"execution_count": 52,
|
1355 |
+
"metadata": {},
|
1356 |
+
"output_type": "execute_result"
|
1357 |
+
}
|
1358 |
+
],
|
1359 |
+
"source": [
|
1360 |
+
"data"
|
1361 |
+
]
|
1362 |
+
},
|
1363 |
+
{
|
1364 |
+
"cell_type": "code",
|
1365 |
+
"execution_count": 63,
|
1366 |
+
"id": "bebb9c93",
|
1367 |
+
"metadata": {},
|
1368 |
+
"outputs": [],
|
1369 |
+
"source": [
|
1370 |
+
"filtered_data = data[data.iloc[:, -3].notna()]"
|
1371 |
+
]
|
1372 |
+
},
|
1373 |
+
{
|
1374 |
+
"cell_type": "code",
|
1375 |
+
"execution_count": 64,
|
1376 |
+
"id": "bd010fc9",
|
1377 |
+
"metadata": {},
|
1378 |
+
"outputs": [],
|
1379 |
+
"source": [
|
1380 |
+
"dict_representation = filtered_data.to_dict(orient='split')"
|
1381 |
+
]
|
1382 |
+
},
|
1383 |
+
{
|
1384 |
+
"cell_type": "code",
|
1385 |
+
"execution_count": 71,
|
1386 |
+
"id": "e84db5c4",
|
1387 |
+
"metadata": {},
|
1388 |
+
"outputs": [
|
1389 |
+
{
|
1390 |
+
"data": {
|
1391 |
+
"text/plain": [
|
1392 |
+
"50468"
|
1393 |
+
]
|
1394 |
+
},
|
1395 |
+
"execution_count": 71,
|
1396 |
+
"metadata": {},
|
1397 |
+
"output_type": "execute_result"
|
1398 |
+
}
|
1399 |
+
],
|
1400 |
+
"source": [
|
1401 |
+
"len(dict_representation['data'])"
|
1402 |
+
]
|
1403 |
+
},
|
1404 |
+
{
|
1405 |
+
"cell_type": "code",
|
1406 |
+
"execution_count": 67,
|
1407 |
+
"id": "31eaadf3",
|
1408 |
+
"metadata": {},
|
1409 |
+
"outputs": [],
|
1410 |
+
"source": [
|
1411 |
+
"sample_df = filtered_data.sample(frac=0.1)"
|
1412 |
+
]
|
1413 |
+
},
|
1414 |
+
{
|
1415 |
+
"cell_type": "code",
|
1416 |
+
"execution_count": 69,
|
1417 |
+
"id": "33998ec6",
|
1418 |
+
"metadata": {},
|
1419 |
+
"outputs": [],
|
1420 |
+
"source": [
|
1421 |
+
"sample_df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
1422 |
+
]
|
1423 |
+
},
|
1424 |
+
{
|
1425 |
+
"cell_type": "code",
|
1426 |
+
"execution_count": 72,
|
1427 |
+
"id": "25396015",
|
1428 |
+
"metadata": {},
|
1429 |
+
"outputs": [
|
1430 |
+
{
|
1431 |
+
"data": {
|
1432 |
+
"text/plain": [
|
1433 |
+
"5047"
|
1434 |
+
]
|
1435 |
+
},
|
1436 |
+
"execution_count": 72,
|
1437 |
+
"metadata": {},
|
1438 |
+
"output_type": "execute_result"
|
1439 |
+
}
|
1440 |
+
],
|
1441 |
+
"source": [
|
1442 |
+
"len(sample_df)"
|
1443 |
+
]
|
1444 |
+
},
|
1445 |
+
{
|
1446 |
+
"cell_type": "code",
|
1447 |
+
"execution_count": 3,
|
1448 |
+
"id": "17d054b5",
|
1449 |
+
"metadata": {},
|
1450 |
+
"outputs": [],
|
1451 |
+
"source": [
|
1452 |
+
"import pandas as pd\n",
|
1453 |
+
"data = pd.read_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
1454 |
+
]
|
1455 |
+
},
|
1456 |
+
{
|
1457 |
+
"cell_type": "code",
|
1458 |
+
"execution_count": 4,
|
1459 |
+
"id": "64db8d6c",
|
1460 |
+
"metadata": {},
|
1461 |
+
"outputs": [],
|
1462 |
+
"source": [
|
1463 |
+
"data_dict = data.to_dict(orient = 'split')"
|
1464 |
+
]
|
1465 |
+
},
|
1466 |
+
{
|
1467 |
+
"cell_type": "code",
|
1468 |
+
"execution_count": 21,
|
1469 |
+
"id": "b32b2f0c",
|
1470 |
+
"metadata": {},
|
1471 |
+
"outputs": [
|
1472 |
+
{
|
1473 |
+
"name": "stdout",
|
1474 |
+
"output_type": "stream",
|
1475 |
+
"text": [
|
1476 |
+
"0 Unnamed: 0.1\n",
|
1477 |
+
"1 Unnamed: 0\n",
|
1478 |
+
"2 NAME\n",
|
1479 |
+
"3 room type\n",
|
1480 |
+
"4 price\n",
|
1481 |
+
"5 minimum nights\n",
|
1482 |
+
"6 review rate number\n",
|
1483 |
+
"7 house_rules\n",
|
1484 |
+
"8 maximum occupancy\n",
|
1485 |
+
"9 city\n"
|
1486 |
+
]
|
1487 |
+
}
|
1488 |
+
],
|
1489 |
+
"source": [
|
1490 |
+
"for idx, unit in enumerate(data_dict['columns']):\n",
|
1491 |
+
" print(idx,unit)"
|
1492 |
+
]
|
1493 |
+
},
|
1494 |
+
{
|
1495 |
+
"cell_type": "code",
|
1496 |
+
"execution_count": 8,
|
1497 |
+
"id": "fe415c1c",
|
1498 |
+
"metadata": {},
|
1499 |
+
"outputs": [
|
1500 |
+
{
|
1501 |
+
"data": {
|
1502 |
+
"text/plain": [
|
1503 |
+
"[0,\n",
|
1504 |
+
" 'Beautiful room upper manhttn.',\n",
|
1505 |
+
" 'Private room',\n",
|
1506 |
+
" 131.0,\n",
|
1507 |
+
" 1.0,\n",
|
1508 |
+
" 2.0,\n",
|
1509 |
+
" 'No smoking. No pets. ',\n",
|
1510 |
+
" 1,\n",
|
1511 |
+
" 'Christiansted']"
|
1512 |
+
]
|
1513 |
+
},
|
1514 |
+
"execution_count": 8,
|
1515 |
+
"metadata": {},
|
1516 |
+
"output_type": "execute_result"
|
1517 |
+
}
|
1518 |
+
],
|
1519 |
+
"source": [
|
1520 |
+
"data_dict['data'][0]"
|
1521 |
+
]
|
1522 |
+
},
|
1523 |
+
{
|
1524 |
+
"cell_type": "code",
|
1525 |
+
"execution_count": 40,
|
1526 |
+
"id": "38cb5c5a",
|
1527 |
+
"metadata": {},
|
1528 |
+
"outputs": [],
|
1529 |
+
"source": [
|
1530 |
+
"import random\n",
|
1531 |
+
"new_data = []\n",
|
1532 |
+
"for idx, unit in enumerate(data_dict['data']):\n",
|
1533 |
+
" tmp_dict = {k:j for k,j in zip(['NAME','room type', 'price','minimum nights','review rate number','house_rules','maximum occupancy','city'],unit[1:])}\n",
|
1534 |
+
" if type(unit[4]) == str:\n",
|
1535 |
+
" tmp_dict[\"price\"] = eval(unit[4].replace(\"$\",\"\").replace(\",\",\"\"))\n",
|
1536 |
+
" house_rules_number = random.choice([0,1,1,1,2,2,3])\n",
|
1537 |
+
" tmp_dict['house_rules'] = \" & \".join(x for x in random.sample([\"No parties\",\"No smoking\",\"No children under 10\",\"No pets\",\"No visitors\"],house_rules_number))\n",
|
1538 |
+
" tmp_dict['city'] = tmp_dict['city'].split('/')[0]\n",
|
1539 |
+
" new_data.append(tmp_dict)"
|
1540 |
+
]
|
1541 |
+
},
|
1542 |
+
{
|
1543 |
+
"cell_type": "code",
|
1544 |
+
"execution_count": 41,
|
1545 |
+
"id": "ae3d551e",
|
1546 |
+
"metadata": {},
|
1547 |
+
"outputs": [
|
1548 |
+
{
|
1549 |
+
"data": {
|
1550 |
+
"text/plain": [
|
1551 |
+
"{'NAME': 'BIG room with bath & balcony in BK!',\n",
|
1552 |
+
" 'room type': 'Private room',\n",
|
1553 |
+
" 'price': 1123.0,\n",
|
1554 |
+
" 'minimum nights': 1.0,\n",
|
1555 |
+
" 'review rate number': 4.0,\n",
|
1556 |
+
" 'house_rules': 'No parties',\n",
|
1557 |
+
" 'maximum occupancy': 2,\n",
|
1558 |
+
" 'city': 'Louisville'}"
|
1559 |
+
]
|
1560 |
+
},
|
1561 |
+
"execution_count": 41,
|
1562 |
+
"metadata": {},
|
1563 |
+
"output_type": "execute_result"
|
1564 |
+
}
|
1565 |
+
],
|
1566 |
+
"source": [
|
1567 |
+
"new_data[2]"
|
1568 |
+
]
|
1569 |
+
},
|
1570 |
+
{
|
1571 |
+
"cell_type": "code",
|
1572 |
+
"execution_count": 42,
|
1573 |
+
"id": "6fac856c",
|
1574 |
+
"metadata": {},
|
1575 |
+
"outputs": [
|
1576 |
+
{
|
1577 |
+
"name": "stdout",
|
1578 |
+
"output_type": "stream",
|
1579 |
+
"text": [
|
1580 |
+
"\n",
|
1581 |
+
"----------\n",
|
1582 |
+
"No pets & No visitors & No smoking\n",
|
1583 |
+
"----------\n",
|
1584 |
+
"No parties & No visitors\n",
|
1585 |
+
"----------\n",
|
1586 |
+
"No children under 10 & No pets & No smoking\n",
|
1587 |
+
"----------\n",
|
1588 |
+
"No parties & No pets & No visitors\n",
|
1589 |
+
"----------\n",
|
1590 |
+
"No pets & No children under 10\n",
|
1591 |
+
"----------\n",
|
1592 |
+
"No children under 10 & No parties & No pets\n",
|
1593 |
+
"----------\n",
|
1594 |
+
"No visitors\n",
|
1595 |
+
"----------\n",
|
1596 |
+
"No parties & No children under 10\n",
|
1597 |
+
"----------\n",
|
1598 |
+
"No children under 10 & No smoking & No visitors\n",
|
1599 |
+
"----------\n",
|
1600 |
+
"No children under 10 & No parties & No smoking\n",
|
1601 |
+
"----------\n",
|
1602 |
+
"No pets & No smoking & No children under 10\n",
|
1603 |
+
"----------\n",
|
1604 |
+
"No pets & No visitors\n",
|
1605 |
+
"----------\n",
|
1606 |
+
"No visitors & No pets\n",
|
1607 |
+
"----------\n",
|
1608 |
+
"No children under 10 & No smoking & No pets\n",
|
1609 |
+
"----------\n",
|
1610 |
+
"No smoking & No parties & No pets\n",
|
1611 |
+
"----------\n",
|
1612 |
+
"No visitors & No children under 10 & No parties\n",
|
1613 |
+
"----------\n",
|
1614 |
+
"No parties & No children under 10 & No smoking\n",
|
1615 |
+
"----------\n",
|
1616 |
+
"No visitors & No children under 10 & No smoking\n",
|
1617 |
+
"----------\n",
|
1618 |
+
"No pets & No parties\n",
|
1619 |
+
"----------\n",
|
1620 |
+
"No smoking & No parties\n",
|
1621 |
+
"----------\n",
|
1622 |
+
"No smoking & No children under 10\n",
|
1623 |
+
"----------\n",
|
1624 |
+
"No parties & No children under 10 & No visitors\n",
|
1625 |
+
"----------\n",
|
1626 |
+
"No children under 10 & No smoking\n",
|
1627 |
+
"----------\n",
|
1628 |
+
"No visitors & No pets & No smoking\n",
|
1629 |
+
"----------\n",
|
1630 |
+
"No pets\n",
|
1631 |
+
"----------\n",
|
1632 |
+
"No children under 10 & No pets\n",
|
1633 |
+
"----------\n",
|
1634 |
+
"No visitors & No smoking\n",
|
1635 |
+
"----------\n",
|
1636 |
+
"No smoking\n",
|
1637 |
+
"----------\n",
|
1638 |
+
"No parties & No smoking & No children under 10\n",
|
1639 |
+
"----------\n",
|
1640 |
+
"No parties & No smoking\n",
|
1641 |
+
"----------\n",
|
1642 |
+
"No smoking & No visitors & No parties\n",
|
1643 |
+
"----------\n",
|
1644 |
+
"No pets & No smoking\n",
|
1645 |
+
"----------\n",
|
1646 |
+
"No pets & No smoking & No parties\n",
|
1647 |
+
"----------\n",
|
1648 |
+
"No smoking & No children under 10 & No visitors\n",
|
1649 |
+
"----------\n",
|
1650 |
+
"No parties & No smoking & No visitors\n",
|
1651 |
+
"----------\n",
|
1652 |
+
"No visitors & No parties\n",
|
1653 |
+
"----------\n",
|
1654 |
+
"No visitors & No children under 10\n",
|
1655 |
+
"----------\n",
|
1656 |
+
"No parties & No smoking & No pets\n",
|
1657 |
+
"----------\n",
|
1658 |
+
"No children under 10 & No pets & No visitors\n",
|
1659 |
+
"----------\n",
|
1660 |
+
"No smoking & No pets & No parties\n",
|
1661 |
+
"----------\n",
|
1662 |
+
"No children under 10 & No smoking & No parties\n",
|
1663 |
+
"----------\n",
|
1664 |
+
"No visitors & No children under 10 & No pets\n",
|
1665 |
+
"----------\n",
|
1666 |
+
"No children under 10 & No parties\n",
|
1667 |
+
"----------\n",
|
1668 |
+
"No pets & No parties & No visitors\n",
|
1669 |
+
"----------\n",
|
1670 |
+
"No children under 10 & No visitors & No parties\n",
|
1671 |
+
"----------\n",
|
1672 |
+
"No parties & No pets\n",
|
1673 |
+
"----------\n",
|
1674 |
+
"No visitors & No parties & No pets\n",
|
1675 |
+
"----------\n",
|
1676 |
+
"No smoking & No pets & No visitors\n",
|
1677 |
+
"----------\n",
|
1678 |
+
"No smoking & No pets\n",
|
1679 |
+
"----------\n",
|
1680 |
+
"No visitors & No smoking & No children under 10\n",
|
1681 |
+
"----------\n",
|
1682 |
+
"No pets & No children under 10 & No parties\n",
|
1683 |
+
"----------\n",
|
1684 |
+
"No visitors & No pets & No children under 10\n",
|
1685 |
+
"----------\n",
|
1686 |
+
"No pets & No children under 10 & No smoking\n",
|
1687 |
+
"----------\n",
|
1688 |
+
"No parties & No visitors & No children under 10\n",
|
1689 |
+
"----------\n",
|
1690 |
+
"No pets & No smoking & No visitors\n",
|
1691 |
+
"----------\n",
|
1692 |
+
"No pets & No parties & No smoking\n",
|
1693 |
+
"----------\n",
|
1694 |
+
"No parties & No visitors & No smoking\n",
|
1695 |
+
"----------\n",
|
1696 |
+
"No pets & No visitors & No children under 10\n",
|
1697 |
+
"----------\n",
|
1698 |
+
"No parties & No visitors & No pets\n",
|
1699 |
+
"----------\n",
|
1700 |
+
"No children under 10\n",
|
1701 |
+
"----------\n",
|
1702 |
+
"No children under 10 & No pets & No parties\n",
|
1703 |
+
"----------\n",
|
1704 |
+
"No children under 10 & No visitors & No smoking\n",
|
1705 |
+
"----------\n",
|
1706 |
+
"No smoking & No children under 10 & No parties\n",
|
1707 |
+
"----------\n",
|
1708 |
+
"No pets & No parties & No children under 10\n",
|
1709 |
+
"----------\n",
|
1710 |
+
"No children under 10 & No visitors & No pets\n",
|
1711 |
+
"----------\n",
|
1712 |
+
"No parties & No pets & No smoking\n",
|
1713 |
+
"----------\n",
|
1714 |
+
"No pets & No children under 10 & No visitors\n",
|
1715 |
+
"----------\n",
|
1716 |
+
"No parties & No children under 10 & No pets\n",
|
1717 |
+
"----------\n",
|
1718 |
+
"No parties & No pets & No children under 10\n",
|
1719 |
+
"----------\n",
|
1720 |
+
"No smoking & No parties & No visitors\n",
|
1721 |
+
"----------\n",
|
1722 |
+
"No parties\n",
|
1723 |
+
"----------\n",
|
1724 |
+
"No visitors & No pets & No parties\n",
|
1725 |
+
"----------\n",
|
1726 |
+
"No children under 10 & No visitors\n",
|
1727 |
+
"----------\n",
|
1728 |
+
"No smoking & No children under 10 & No pets\n",
|
1729 |
+
"----------\n",
|
1730 |
+
"No smoking & No parties & No children under 10\n",
|
1731 |
+
"----------\n",
|
1732 |
+
"No visitors & No smoking & No parties\n",
|
1733 |
+
"----------\n",
|
1734 |
+
"No pets & No visitors & No parties\n",
|
1735 |
+
"----------\n",
|
1736 |
+
"No smoking & No visitors\n",
|
1737 |
+
"----------\n",
|
1738 |
+
"No smoking & No visitors & No children under 10\n",
|
1739 |
+
"----------\n",
|
1740 |
+
"No visitors & No smoking & No pets\n",
|
1741 |
+
"----------\n",
|
1742 |
+
"No smoking & No visitors & No pets\n",
|
1743 |
+
"----------\n",
|
1744 |
+
"No visitors & No parties & No smoking\n",
|
1745 |
+
"----------\n",
|
1746 |
+
"No smoking & No pets & No children under 10\n",
|
1747 |
+
"----------\n",
|
1748 |
+
"No children under 10 & No parties & No visitors\n",
|
1749 |
+
"----------\n",
|
1750 |
+
"No visitors & No parties & No children under 10\n",
|
1751 |
+
"----------\n"
|
1752 |
+
]
|
1753 |
+
}
|
1754 |
+
],
|
1755 |
+
"source": [
|
1756 |
+
"maximum_occupancy_set = set()\n",
|
1757 |
+
"for unit in new_data:\n",
|
1758 |
+
" maximum_occupancy_set.add(unit['house_rules'])\n",
|
1759 |
+
"for unit in maximum_occupancy_set:\n",
|
1760 |
+
" print(unit)\n",
|
1761 |
+
" print(\"----------\")"
|
1762 |
+
]
|
1763 |
+
},
|
1764 |
+
{
|
1765 |
+
"cell_type": "code",
|
1766 |
+
"execution_count": 45,
|
1767 |
+
"id": "8056052a",
|
1768 |
+
"metadata": {},
|
1769 |
+
"outputs": [
|
1770 |
+
{
|
1771 |
+
"data": {
|
1772 |
+
"text/html": [
|
1773 |
+
"<div>\n",
|
1774 |
+
"<style scoped>\n",
|
1775 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
1776 |
+
" vertical-align: middle;\n",
|
1777 |
+
" }\n",
|
1778 |
+
"\n",
|
1779 |
+
" .dataframe tbody tr th {\n",
|
1780 |
+
" vertical-align: top;\n",
|
1781 |
+
" }\n",
|
1782 |
+
"\n",
|
1783 |
+
" .dataframe thead th {\n",
|
1784 |
+
" text-align: right;\n",
|
1785 |
+
" }\n",
|
1786 |
+
"</style>\n",
|
1787 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
1788 |
+
" <thead>\n",
|
1789 |
+
" <tr style=\"text-align: right;\">\n",
|
1790 |
+
" <th></th>\n",
|
1791 |
+
" <th>NAME</th>\n",
|
1792 |
+
" <th>room type</th>\n",
|
1793 |
+
" <th>price</th>\n",
|
1794 |
+
" <th>minimum nights</th>\n",
|
1795 |
+
" <th>review rate number</th>\n",
|
1796 |
+
" <th>house_rules</th>\n",
|
1797 |
+
" <th>maximum occupancy</th>\n",
|
1798 |
+
" <th>city</th>\n",
|
1799 |
+
" </tr>\n",
|
1800 |
+
" </thead>\n",
|
1801 |
+
" <tbody>\n",
|
1802 |
+
" <tr>\n",
|
1803 |
+
" <th>0</th>\n",
|
1804 |
+
" <td>Beautiful room upper manhttn.</td>\n",
|
1805 |
+
" <td>Private room</td>\n",
|
1806 |
+
" <td>131.0</td>\n",
|
1807 |
+
" <td>1.0</td>\n",
|
1808 |
+
" <td>2.0</td>\n",
|
1809 |
+
" <td>No smoking</td>\n",
|
1810 |
+
" <td>1</td>\n",
|
1811 |
+
" <td>Christiansted</td>\n",
|
1812 |
+
" </tr>\n",
|
1813 |
+
" <tr>\n",
|
1814 |
+
" <th>1</th>\n",
|
1815 |
+
" <td>Roomy and Comftable Room</td>\n",
|
1816 |
+
" <td>Private room</td>\n",
|
1817 |
+
" <td>548.0</td>\n",
|
1818 |
+
" <td>10.0</td>\n",
|
1819 |
+
" <td>5.0</td>\n",
|
1820 |
+
" <td>No children under 10 & No parties</td>\n",
|
1821 |
+
" <td>2</td>\n",
|
1822 |
+
" <td>Laredo</td>\n",
|
1823 |
+
" </tr>\n",
|
1824 |
+
" <tr>\n",
|
1825 |
+
" <th>2</th>\n",
|
1826 |
+
" <td>BIG room with bath & balcony in BK!</td>\n",
|
1827 |
+
" <td>Private room</td>\n",
|
1828 |
+
" <td>1123.0</td>\n",
|
1829 |
+
" <td>1.0</td>\n",
|
1830 |
+
" <td>4.0</td>\n",
|
1831 |
+
" <td>No parties</td>\n",
|
1832 |
+
" <td>2</td>\n",
|
1833 |
+
" <td>Louisville</td>\n",
|
1834 |
+
" </tr>\n",
|
1835 |
+
" <tr>\n",
|
1836 |
+
" <th>3</th>\n",
|
1837 |
+
" <td>4A-</td>\n",
|
1838 |
+
" <td>Entire home/apt</td>\n",
|
1839 |
+
" <td>225.0</td>\n",
|
1840 |
+
" <td>30.0</td>\n",
|
1841 |
+
" <td>4.0</td>\n",
|
1842 |
+
" <td>No pets</td>\n",
|
1843 |
+
" <td>3</td>\n",
|
1844 |
+
" <td>Greensboro</td>\n",
|
1845 |
+
" </tr>\n",
|
1846 |
+
" <tr>\n",
|
1847 |
+
" <th>4</th>\n",
|
1848 |
+
" <td>Nice and Comfortable Private Room</td>\n",
|
1849 |
+
" <td>Private room</td>\n",
|
1850 |
+
" <td>761.0</td>\n",
|
1851 |
+
" <td>2.0</td>\n",
|
1852 |
+
" <td>1.0</td>\n",
|
1853 |
+
" <td>No smoking & No parties</td>\n",
|
1854 |
+
" <td>2</td>\n",
|
1855 |
+
" <td>Cape Girardeau</td>\n",
|
1856 |
+
" </tr>\n",
|
1857 |
+
" <tr>\n",
|
1858 |
+
" <th>...</th>\n",
|
1859 |
+
" <td>...</td>\n",
|
1860 |
+
" <td>...</td>\n",
|
1861 |
+
" <td>...</td>\n",
|
1862 |
+
" <td>...</td>\n",
|
1863 |
+
" <td>...</td>\n",
|
1864 |
+
" <td>...</td>\n",
|
1865 |
+
" <td>...</td>\n",
|
1866 |
+
" <td>...</td>\n",
|
1867 |
+
" </tr>\n",
|
1868 |
+
" <tr>\n",
|
1869 |
+
" <th>5042</th>\n",
|
1870 |
+
" <td>Amazing LOFT in Prime Williamsburg</td>\n",
|
1871 |
+
" <td>Private room</td>\n",
|
1872 |
+
" <td>249.0</td>\n",
|
1873 |
+
" <td>5.0</td>\n",
|
1874 |
+
" <td>5.0</td>\n",
|
1875 |
+
" <td>No pets</td>\n",
|
1876 |
+
" <td>2</td>\n",
|
1877 |
+
" <td>Trenton</td>\n",
|
1878 |
+
" </tr>\n",
|
1879 |
+
" <tr>\n",
|
1880 |
+
" <th>5043</th>\n",
|
1881 |
+
" <td>Private Queen Bedroom in Brooklyn</td>\n",
|
1882 |
+
" <td>Private room</td>\n",
|
1883 |
+
" <td>1032.0</td>\n",
|
1884 |
+
" <td>1.0</td>\n",
|
1885 |
+
" <td>1.0</td>\n",
|
1886 |
+
" <td>No pets</td>\n",
|
1887 |
+
" <td>1</td>\n",
|
1888 |
+
" <td>Des Moines</td>\n",
|
1889 |
+
" </tr>\n",
|
1890 |
+
" <tr>\n",
|
1891 |
+
" <th>5044</th>\n",
|
1892 |
+
" <td>Bushwick / Bed Sty Retreat</td>\n",
|
1893 |
+
" <td>Private room</td>\n",
|
1894 |
+
" <td>546.0</td>\n",
|
1895 |
+
" <td>2.0</td>\n",
|
1896 |
+
" <td>4.0</td>\n",
|
1897 |
+
" <td>No children under 10 & No visitors & No smoking</td>\n",
|
1898 |
+
" <td>2</td>\n",
|
1899 |
+
" <td>Scottsbluff</td>\n",
|
1900 |
+
" </tr>\n",
|
1901 |
+
" <tr>\n",
|
1902 |
+
" <th>5045</th>\n",
|
1903 |
+
" <td>Charming Mid-Century Studio</td>\n",
|
1904 |
+
" <td>Entire home/apt</td>\n",
|
1905 |
+
" <td>1115.0</td>\n",
|
1906 |
+
" <td>2.0</td>\n",
|
1907 |
+
" <td>5.0</td>\n",
|
1908 |
+
" <td>No pets & No children under 10</td>\n",
|
1909 |
+
" <td>7</td>\n",
|
1910 |
+
" <td>Butte</td>\n",
|
1911 |
+
" </tr>\n",
|
1912 |
+
" <tr>\n",
|
1913 |
+
" <th>5046</th>\n",
|
1914 |
+
" <td>3 Bed/ 2 Bath Full Apt. BK Heights</td>\n",
|
1915 |
+
" <td>Entire home/apt</td>\n",
|
1916 |
+
" <td>396.0</td>\n",
|
1917 |
+
" <td>2.0</td>\n",
|
1918 |
+
" <td>1.0</td>\n",
|
1919 |
+
" <td>No smoking</td>\n",
|
1920 |
+
" <td>3</td>\n",
|
1921 |
+
" <td>Norfolk</td>\n",
|
1922 |
+
" </tr>\n",
|
1923 |
+
" </tbody>\n",
|
1924 |
+
"</table>\n",
|
1925 |
+
"<p>5047 rows × 8 columns</p>\n",
|
1926 |
+
"</div>"
|
1927 |
+
],
|
1928 |
+
"text/plain": [
|
1929 |
+
" NAME room type price \n",
|
1930 |
+
"0 Beautiful room upper manhttn. Private room 131.0 \\\n",
|
1931 |
+
"1 Roomy and Comftable Room Private room 548.0 \n",
|
1932 |
+
"2 BIG room with bath & balcony in BK! Private room 1123.0 \n",
|
1933 |
+
"3 4A- Entire home/apt 225.0 \n",
|
1934 |
+
"4 Nice and Comfortable Private Room Private room 761.0 \n",
|
1935 |
+
"... ... ... ... \n",
|
1936 |
+
"5042 Amazing LOFT in Prime Williamsburg Private room 249.0 \n",
|
1937 |
+
"5043 Private Queen Bedroom in Brooklyn Private room 1032.0 \n",
|
1938 |
+
"5044 Bushwick / Bed Sty Retreat Private room 546.0 \n",
|
1939 |
+
"5045 Charming Mid-Century Studio Entire home/apt 1115.0 \n",
|
1940 |
+
"5046 3 Bed/ 2 Bath Full Apt. BK Heights Entire home/apt 396.0 \n",
|
1941 |
+
"\n",
|
1942 |
+
" minimum nights review rate number \n",
|
1943 |
+
"0 1.0 2.0 \\\n",
|
1944 |
+
"1 10.0 5.0 \n",
|
1945 |
+
"2 1.0 4.0 \n",
|
1946 |
+
"3 30.0 4.0 \n",
|
1947 |
+
"4 2.0 1.0 \n",
|
1948 |
+
"... ... ... \n",
|
1949 |
+
"5042 5.0 5.0 \n",
|
1950 |
+
"5043 1.0 1.0 \n",
|
1951 |
+
"5044 2.0 4.0 \n",
|
1952 |
+
"5045 2.0 5.0 \n",
|
1953 |
+
"5046 2.0 1.0 \n",
|
1954 |
+
"\n",
|
1955 |
+
" house_rules maximum occupancy \n",
|
1956 |
+
"0 No smoking 1 \\\n",
|
1957 |
+
"1 No children under 10 & No parties 2 \n",
|
1958 |
+
"2 No parties 2 \n",
|
1959 |
+
"3 No pets 3 \n",
|
1960 |
+
"4 No smoking & No parties 2 \n",
|
1961 |
+
"... ... ... \n",
|
1962 |
+
"5042 No pets 2 \n",
|
1963 |
+
"5043 No pets 1 \n",
|
1964 |
+
"5044 No children under 10 & No visitors & No smoking 2 \n",
|
1965 |
+
"5045 No pets & No children under 10 7 \n",
|
1966 |
+
"5046 No smoking 3 \n",
|
1967 |
+
"\n",
|
1968 |
+
" city \n",
|
1969 |
+
"0 Christiansted \n",
|
1970 |
+
"1 Laredo \n",
|
1971 |
+
"2 Louisville \n",
|
1972 |
+
"3 Greensboro \n",
|
1973 |
+
"4 Cape Girardeau \n",
|
1974 |
+
"... ... \n",
|
1975 |
+
"5042 Trenton \n",
|
1976 |
+
"5043 Des Moines \n",
|
1977 |
+
"5044 Scottsbluff \n",
|
1978 |
+
"5045 Butte \n",
|
1979 |
+
"5046 Norfolk \n",
|
1980 |
+
"\n",
|
1981 |
+
"[5047 rows x 8 columns]"
|
1982 |
+
]
|
1983 |
+
},
|
1984 |
+
"execution_count": 45,
|
1985 |
+
"metadata": {},
|
1986 |
+
"output_type": "execute_result"
|
1987 |
+
}
|
1988 |
+
],
|
1989 |
+
"source": [
|
1990 |
+
"df"
|
1991 |
+
]
|
1992 |
+
},
|
1993 |
+
{
|
1994 |
+
"cell_type": "code",
|
1995 |
+
"execution_count": 44,
|
1996 |
+
"id": "54423e0d",
|
1997 |
+
"metadata": {},
|
1998 |
+
"outputs": [],
|
1999 |
+
"source": [
|
2000 |
+
"df = pd.DataFrame(new_data)\n",
|
2001 |
+
"df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
2002 |
+
]
|
2003 |
+
},
|
2004 |
+
{
|
2005 |
+
"cell_type": "code",
|
2006 |
+
"execution_count": null,
|
2007 |
+
"id": "5767aa80",
|
2008 |
+
"metadata": {},
|
2009 |
+
"outputs": [],
|
2010 |
+
"source": [
|
2011 |
+
"df.rename(columns={'old_name1': 'new_name1', 'old_name2': 'new_name2'}, inplace=True)\n",
|
2012 |
+
"df.to_csv('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')"
|
2013 |
+
]
|
2014 |
+
}
|
2015 |
+
],
|
2016 |
+
"metadata": {
|
2017 |
+
"kernelspec": {
|
2018 |
+
"display_name": "Python 3 (ipykernel)",
|
2019 |
+
"language": "python",
|
2020 |
+
"name": "python3"
|
2021 |
+
},
|
2022 |
+
"language_info": {
|
2023 |
+
"codemirror_mode": {
|
2024 |
+
"name": "ipython",
|
2025 |
+
"version": 3
|
2026 |
+
},
|
2027 |
+
"file_extension": ".py",
|
2028 |
+
"mimetype": "text/x-python",
|
2029 |
+
"name": "python",
|
2030 |
+
"nbconvert_exporter": "python",
|
2031 |
+
"pygments_lexer": "ipython3",
|
2032 |
+
"version": "3.9.16"
|
2033 |
+
}
|
2034 |
+
},
|
2035 |
+
"nbformat": 4,
|
2036 |
+
"nbformat_minor": 5
|
2037 |
+
}
|
tools/accommodations/test.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.accommodations.apis import Hotels
import pandas as pd

# Show all columns (up to 100) when printing DataFrames.
pd.set_option('display.max_columns', 100)

# Show all rows (up to 100) when printing DataFrames.
pd.set_option('display.max_rows', 100)

# Smoke test: load the hotels database and print every listing for New York.
hotel = Hotels('/home/xj/toolAugEnv/code/toolConstraint/database/hotels/clean_hotels_2022.csv')
data = hotel.run('New York')
print(data)
|
tools/attractions/__pycache__/apis.cpython-39.pyc
ADDED
Binary file (1.55 kB). View file
|
|
tools/attractions/apis.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import pandas as pd
|
2 |
+
from pandas import DataFrame
|
3 |
+
from typing import Optional
|
4 |
+
from annotation.src.utils import extract_before_parenthesis
|
5 |
+
|
6 |
+
|
7 |
+
class Attractions:
    """Tool that looks up tourist attractions by city from a CSV database."""

    def __init__(self, path="../database/attractions/attractions.csv"):
        """Load the CSV at *path*, dropping incomplete rows and keeping only
        the columns the downstream planner consumes."""
        self.path = path
        self.data = pd.read_csv(self.path).dropna()[
            ['Name', 'Latitude', 'Longitude', 'Address', 'Phone', 'Website', "City"]
        ]
        print("Attractions loaded.")

    def load_db(self):
        """Reload the raw CSV from disk.

        NOTE(review): unlike __init__, this applies neither dropna() nor the
        column filter — confirm whether that asymmetry is intentional.
        """
        self.data = pd.read_csv(self.path)

    def run(self,
            city: str,
            ) -> DataFrame:
        """Search for attractions in *city*.

        Returns a DataFrame of matches re-numbered from 0, or the sentinel
        string "There is no attraction in this city." when there are none
        (callers check for this string).
        """
        results = self.data[self.data["City"] == city]
        # Re-number rows from 0 so the caller sees a clean index.
        results = results.reset_index(drop=True)
        if len(results) == 0:
            return "There is no attraction in this city."
        return results

    def run_for_annotation(self,
                           city: str,
                           ) -> DataFrame:
        """Search for attractions for the annotation UI, first stripping any
        parenthesised suffix from *city* via extract_before_parenthesis.

        Unlike run(), an empty DataFrame (not a string) is returned when
        there is no match.
        """
        results = self.data[self.data["City"] == extract_before_parenthesis(city)]
        # Re-number rows from 0 so the caller sees a clean index.
        results = results.reset_index(drop=True)
        return results
|
tools/attractions/test.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from tools.attractions.apis import Attractions
import pandas as pd
import sys
import os

# Make the parent directory importable and anchor relative paths at this file.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "..")))
os.chdir(os.path.dirname(os.path.abspath(__file__)))

# Load the attractions database once (the original constructed it twice).
df = Attractions(path="../../database/attractions/attractions.csv").data
print(len(df))

# Read the set of known cities; a set gives O(1) membership tests and the
# context manager closes the file handle (the original leaked it).
with open('../../database/background/citySet.txt', 'r') as f:
    city_set = set(f.read().split('\n'))

# Keep only rows whose city is in the known-city set — equivalent to the
# original per-city elimination loop.
df = df[df['City'].isin(city_set)]
print(len(df))

df.to_csv('../../database/attractions/attractions2.csv', index=False)
|
tools/cities/__pycache__/apis.cpython-39.pyc
ADDED
Binary file (1.1 kB). View file
|
|
tools/cities/apis.py
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from pandas import DataFrame

class Cities:
    """Maps a US state name to the list of cities located in it."""

    def __init__(self, path="../database/background/citySet_with_states.txt") -> None:
        self.path = path
        self.load_data()
        print("Cities loaded.")

    def load_data(self):
        """Parse the tab-separated "city<TAB>state" file into self.data,
        a dict mapping state name -> list of city names."""
        self.data = {}
        # Context manager closes the handle (the original left it open).
        with open(self.path, "r") as f:
            for unit in f.read().strip().split("\n"):
                city, state = unit.split("\t")
                self.data.setdefault(state, []).append(city)

    def run(self, state) -> list:
        """Return the list of cities for *state*.

        Raises:
            ValueError: if *state* is not in the database. (Bug fix: the
            original *returned* the ValueError instance instead of raising
            it, so callers could never catch it.)
        """
        if state not in self.data:
            raise ValueError("Invalid State")
        return self.data[state]
|
tools/cities/test.py
ADDED
File without changes
|