Spaces:
Sleeping
Sleeping
add init py
Browse files- .DS_Store +0 -0
- RECODE_speckle_utils +0 -1
- speckleUtils/__init__.py +0 -0
- speckleUtils/color_maps.py +36 -0
- speckleUtils/data_utils.py +616 -0
- speckleUtils/plots_utils.py +814 -0
- speckleUtils/speckle_utils.py +696 -0
- tripGenerationFunc.py +6 -3
.DS_Store
CHANGED
Binary files a/.DS_Store and b/.DS_Store differ
|
|
RECODE_speckle_utils
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
Subproject commit 9d8b034751522ec0220f9dfffd702292abce8de4
|
|
|
|
speckleUtils/__init__.py
ADDED
File without changes
|
speckleUtils/color_maps.py
ADDED
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def gh_color_blueRed():
    """Return the Grasshopper blue-to-red colour scheme.

    Returns:
        list: Five ``[r, g, b]`` lists whose channels are scaled from the
            0-255 range to the [0, 1] range.
    """
    rgb_stops = (
        (15, 16, 115),
        (177, 198, 242),
        (251, 244, 121),
        (222, 140, 61),
        (183, 60, 34),
    )
    scaled = []
    for red, green, blue in rgb_stops:
        # Divide each channel by 255 to obtain the unit-range value.
        scaled.append([red / 255., green / 255., blue / 255.])
    return scaled
|
11 |
+
|
12 |
+
def gh_color_whiteRed():
    """Return the Grasshopper white-to-red colour scheme.

    Returns:
        list: Two ``[r, g, b]`` lists (white, dark red) with channels
            scaled to the [0, 1] range.
    """
    endpoints = ((255, 255, 255), (111, 19, 12))
    # Scale while building the result instead of re-binding a raw list.
    return [[channel / 255. for channel in rgb] for rgb in endpoints]
|
20 |
+
|
21 |
+
def gh_color_cluster():
    """Return the Grasshopper colour palette used for clusters.

    Returns:
        list: Six ``[r, g, b]`` lists with channels scaled to [0, 1].
    """
    palette = (
        (181, 200, 230),
        (227, 170, 170),
        (200, 200, 200),
        (250, 200, 254),
        (200, 180, 220),
        (180, 220, 170),
    )

    def to_unit_range(rgb):
        # 0-255 integer channels -> floats in [0, 1]
        return [value / 255. for value in rgb]

    return list(map(to_unit_range, palette))
|
34 |
+
|
35 |
+
#---
|
36 |
+
#---
|
speckleUtils/data_utils.py
ADDED
@@ -0,0 +1,616 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
import copy
|
5 |
+
import os
|
6 |
+
import csv
|
7 |
+
import io
|
8 |
+
import json
|
9 |
+
import requests
|
10 |
+
|
11 |
+
try:
|
12 |
+
from fuzzywuzzy import fuzz
|
13 |
+
except:
|
14 |
+
pass
|
15 |
+
|
16 |
+
def helper():
    """Print an overview of the utility functions offered by this module."""
    messages = (
        "This module contains a set of utility functions for data processing.",
        "______________________________________________________________________",
        "for detailed help call >>> help(speckle_utils.function_name) <<< ",
        "______________________________________________________________________",
        "available functions:",
        "cleanData(data, mode='drop', num_only=False) -> clean dataframes, series or numpy arrays",
        """ sort_and_match_df(A, B, uuid_column) -> merges two dataframes by a common uuid comon (best practice: always use this)""",
        "transform_to_score(data, minPts, maxPts, t_low, t_high, cull_invalid=False) -> transform data to a score based on percentiles and provided points",
        "colab_create_directory(base_name) -> create a directory with the given name, if it already exists, add a number to the end of the name, usefull for colab",
        "colab_zip_download_folder(dir_name) -> zips and downloads a directory from colab. will only work in google colaboratory ",
    )
    # One print call per message, in order.
    for message in messages:
        print(message)
|
30 |
+
|
31 |
+
|
32 |
+
def cleanData(data, mode="drop", num_only=False, print_report=True):
    """
    Clean data by handling missing or null values according to ``mode``.

    Args:
        data (numpy.ndarray, pandas.DataFrame, pandas.Series): Input data to be cleaned.
        mode (str, optional): How missing values are handled.
            "drop" drops rows with missing values (default),
            "replace_zero" replaces missing values with zero,
            "replace_mean" replaces missing values with the mean of the column.
            Any other value leaves the missing values untouched.
        num_only (bool, optional): If True and data is a DataFrame, every column is
            coerced with ``pd.to_numeric`` and only numeric columns are kept.
            Defaults to False. Ignored for Series and arrays.
        print_report (bool, optional): If True the report dict (dropped rows /
            dropped columns) is printed to the console. Defaults to True.

    Returns:
        numpy.ndarray, pandas.DataFrame, pandas.Series: Cleaned data with the same
        type as the input.

    Raises:
        ValueError: If the input data type is not supported (must be numpy.ndarray,
            pandas.DataFrame or pandas.Series).

    Notes:
        For numpy arrays "drop" uses a boolean mask on the whole array, so a
        multi-dimensional array comes back flattened, and 'dropped_rows' then
        counts removed elements rather than rows.
    """
    report = {}
    if isinstance(data, pd.DataFrame):
        initial_cols = data.columns.tolist()
        initial_rows = data.shape[0]
        if num_only:
            # Attempt casting before doing the numeric selection.
            coerced = data.apply(pd.to_numeric, errors='coerce')
            # A column that held values before coercion but is entirely NaN
            # afterwards was not numeric. Keeping it would make mode="drop"
            # delete every row, so it is removed here (compared positionally
            # so duplicate labels cannot break the mask).
            had_values = data.notna().any().to_numpy(dtype=bool)
            has_values = coerced.notna().any().to_numpy(dtype=bool)
            coerced = coerced.loc[:, ~(had_values & ~has_values)]
            # "number" keeps every numeric width (int32, float32, ...), not
            # only int64/float64.
            data = coerced.select_dtypes(include='number')
            report['dropped_cols'] = list(set(initial_cols) - set(data.columns.tolist()))

        if mode == "drop":
            data = data.dropna()
            report['dropped_rows'] = initial_rows - data.shape[0]
        elif mode == "replace_zero":
            data = data.fillna(0)
        elif mode == "replace_mean":
            # numeric_only=True: non-numeric columns have no mean and would
            # otherwise raise a TypeError on current pandas versions.
            data = data.fillna(data.mean(numeric_only=True))

    elif isinstance(data, pd.Series):
        initial_length = len(data)
        if mode == "drop":
            data = data.dropna()
            report['dropped_rows'] = initial_length - len(data)
        elif mode == "replace_zero":
            data = data.fillna(0)
        elif mode == "replace_mean":
            data = data.fillna(data.mean())

    elif isinstance(data, np.ndarray):
        initial_length = data.size
        if mode == "drop":
            data = data[~np.isnan(data)]
            report['dropped_rows'] = initial_length - data.size
        elif mode == "replace_zero":
            data = np.nan_to_num(data, nan=0)
        elif mode == "replace_mean":
            data = np.where(np.isnan(data), np.nanmean(data), data)

    else:
        raise ValueError("Unsupported data type")

    if print_report:
        print(report)
    return data
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
def sort_and_match_df(A, B, uuid_column):
    """
    Left-join DataFrame B onto A using a shared uuid_column.

    If uuid_column is one of a frame's columns it becomes that frame's index
    (the column itself is kept); otherwise the frame's existing index is used
    for the join.

    Parameters:
        A, B (DataFrame): Input DataFrames to be matched.
        uuid_column (str): Shared column/index for matching rows.

    Returns:
        DataFrame: Result of the left join of A and B with the index reset.
        When both frames carry uuid_column as a column, the result contains
        the suffixed copies produced by the merge plus the index restored as
        ``uuid_column``.
    """
    if uuid_column in A.columns:
        A = A.set_index(uuid_column, drop=False)
    if uuid_column in B.columns:
        B = B.set_index(uuid_column, drop=False)

    merged_df = pd.merge(A, B, left_index=True, right_index=True, how='left')

    # If only one side had uuid_column as a column, the merge keeps it
    # un-suffixed. Re-inserting the identically named index would then raise
    # "cannot insert ..., already exists", so the redundant index is dropped.
    if merged_df.index.name == uuid_column and uuid_column in merged_df.columns:
        return merged_df.reset_index(drop=True)
    return merged_df.reset_index(drop=False)
|
121 |
+
|
122 |
+
|
123 |
+
def sort_and_match_dfs(dfs, uuid_column):
    """
    Successively left-join all DataFrames in ``dfs`` on a shared uuid_column.

    If uuid_column is one of a frame's columns it becomes that frame's index
    (the column itself is kept); otherwise the frame's existing index is used.
    A warning is printed for every pair of frames that share column names
    other than uuid_column. The caller's list is not modified.

    Parameters:
        dfs (list): A list of DataFrames to be matched.
        uuid_column (str): Shared column/index for matching rows.

    Returns:
        DataFrame: Result of the successive left joins with the index reset.

    Raises:
        ValueError: If ``dfs`` is empty.
    """
    if not dfs:
        raise ValueError("The input list of DataFrames is empty")

    # Build a new list instead of assigning into ``dfs`` so the caller's list
    # keeps pointing at the original frames.
    indexed = [
        df.set_index(uuid_column, drop=False) if uuid_column in df.columns else df
        for df in dfs
    ]

    # Check for overlapping column names
    all_columns = [set(df.columns) for df in indexed]
    for i, columns_i in enumerate(all_columns):
        for j, columns_j in enumerate(all_columns[i+1:], start=i+1):
            overlapping_columns = columns_i.intersection(columns_j) - {uuid_column}
            if overlapping_columns:
                # str() + sorted(): labels may be non-strings, and a set has
                # no stable order.
                names = ', '.join(sorted(map(str, overlapping_columns)))
                print(f"Warning: DataFrames at indices {i} and {j} have overlapping column(s): {names}")

    result_df = indexed[0]
    for df in indexed[1:]:
        result_df = pd.merge(result_df, df, left_index=True, right_index=True, how='left')

    # With one frame, or three or more, uuid_column can survive the merges as
    # an un-suffixed column. Re-inserting the identically named index would
    # raise "cannot insert ..., already exists", so the index is dropped then.
    if result_df.index.name == uuid_column and uuid_column in result_df.columns:
        return result_df.reset_index(drop=True)
    return result_df.reset_index(drop=False)
|
157 |
+
|
158 |
+
|
159 |
+
|
160 |
+
|
161 |
+
def transform_to_score(data, minPts, maxPts, t_low, t_high, cull_invalid=False):
    """
    Transform data to a score based on percentiles and provided points.

    Args:
        data (numpy.array, pandas.Series or sequence of numbers): Input data.
        minPts (float): Score assigned at (and below) the t_low percentile.
        maxPts (float): Score assigned at (and above) the t_high percentile.
        t_low (float): The lower percentile threshold (0-100).
        t_high (float): The upper percentile threshold (0-100).
        cull_invalid (bool, optional): If True, the data is first passed through
            ``cleanData(..., mode="drop", num_only=True)``. Defaults to False.

    Returns:
        numpy.array: One float score per (remaining) data point.

    The t_low and t_high percentiles of the data are computed and every value
    is mapped linearly from [percentile_low, percentile_high] onto
    [minPts, maxPts]. Values outside that interval are clamped, and any value
    at or above the t_high percentile receives exactly maxPts.
    """
    # If cull_invalid is True, the data is cleaned and invalid data is removed.
    if cull_invalid:
        data = cleanData(data, mode="drop", num_only=True)

    # Work on a float array so lists, Series and arrays behave the same.
    values = np.asarray(data, dtype=float)

    # Calculate the percentile values based on the data
    percentile_low = np.percentile(values, t_low)
    percentile_high = np.percentile(values, t_high)

    # Apply linear interpolation between minPts and maxPts
    transformed_data = np.interp(values, [percentile_low, percentile_high], [minPts, maxPts])

    # Saturate at maxPts. The mask is built from the *input* values: after the
    # interpolation ``transformed_data`` holds scores, which cannot be
    # compared against a percentile of the data.
    transformed_data[values >= percentile_high] = maxPts

    return transformed_data
|
200 |
+
|
201 |
+
|
202 |
+
def colab_create_directory(base_name):
    """Create a fresh directory and return its name.

    If ``base_name`` already exists, ``_1``, ``_2``, ... is appended until an
    unused name is found. Useful in colab for batch-saving e.g. images
    without overwriting earlier output.

    Args:
        base_name (str): name of the directory to create
    Returns:
        str: name of the created directory
    """
    candidate = base_name
    suffix = 0
    while os.path.exists(candidate):
        suffix += 1
        candidate = f"{base_name}_{suffix}"

    os.mkdir(candidate)
    return candidate
|
218 |
+
|
219 |
+
def smart_round(x):
    """Round ``x`` with a precision that depends on its magnitude.

    |x| >= 1000 -> no decimals, |x| >= 10 -> 1 decimal,
    |x| >= 1 -> 2 decimals, otherwise 3 decimals.
    """
    # (lower bound of |x|, ndigits); None is the same as calling round(x).
    precision_by_magnitude = ((1000, None), (10, 1), (1, 2))
    for lower_bound, digits in precision_by_magnitude:
        if abs(x) >= lower_bound:
            return round(x, digits)
    return round(x, 3)
|
228 |
+
|
229 |
+
def colab_zip_download_folder(dir_name):
    """Zip a directory under /content and download it; intended for Google Colab.

    Args:
        dir_name (str): name of the directory (inside /content) to zip and download
    Returns:
        None. On success the file is handed to ``files.download``; on any
        failure a hint is printed instead of raising.
    """
    try:
        # zip the directory
        # NOTE(review): this is a plain string, not an f-string. Presumably it
        # relies on the IPython shell expanding ``{dir_name}`` from this
        # function's local variables - confirm before renaming the parameter
        # or converting the literal.
        get_ipython().system('zip -r /content/{dir_name}.zip /content/{dir_name}')

        # download the zip file
        from google.colab import files
        files.download(f"/content/{dir_name}.zip")
    # Bare except: every error (missing IPython, missing google.colab, ...) is
    # reduced to the printed hint below.
    except:
        print("something went wrong, this function will only work in google colab, make sure to import the necessary packages. >>> from google.colab import files <<<" )
|
244 |
+
|
245 |
+
|
246 |
+
|
247 |
+
|
248 |
+
def generate__cluster_prompt(data_context, analysis_goal, column_descriptions, cluster_stat, complexity, exemplary_cluster_names_descriptions=None, creativity=None):
    """Assemble a text prompt asking for a name and description of one cluster.

    Args:
        data_context: Text describing where the data comes from.
        analysis_goal: Text describing the goal of the analysis.
        column_descriptions (dict): Maps column name -> description.
        cluster_stat (dict): Needs the keys 'cluster_id' and 'columns', where
            'columns' maps column name -> {statistic name: value}.
        complexity (int): 1, 2 or 3; selects the requested level of detail.
        exemplary_cluster_names_descriptions: Optional examples quoted in the prompt.
        creativity: Optional creativity level quoted in the prompt. The
            examples are only used when both optional arguments are given.

    Returns:
        str: The complete prompt.

    Raises:
        KeyError: If ``complexity`` is not 1, 2 or 3.
    """
    audience_by_level = {
        1: "Please explain the findings in a simple way, suitable for someone with no knowledge of statistics or data science.",
        2: "Please explain the findings in moderate detail, suitable for someone with basic understanding of statistics or data science.",
        3: "Please explain the findings in great detail, suitable for someone with advanced understanding of statistics or data science."
    }

    # Collect the fragments in order and join them once at the end.
    pieces = [
        f"The data you are analyzing is from the following context: {data_context}. The goal of this analysis is: {analysis_goal}.\n\n",
        "The data consists of the following columns:\n",
    ]
    pieces.extend(f"- {name}: {text}\n" for name, text in column_descriptions.items())

    # Statistics of the cluster to be described
    pieces.append("\nBased on the data, the following cluster has been identified:\n")
    pieces.append(f"\nCluster ID: {cluster_stat['cluster_id']}\n")
    for name, stat_values in cluster_stat['columns'].items():
        pieces.append(f"- {name}:\n")
        pieces.extend(f" - {label}: {number}\n" for label, number in stat_values.items())

    # The wording depends on whether examples and a creativity level were given
    without_examples = exemplary_cluster_names_descriptions is None or creativity is None
    if without_examples:
        pieces.append("\nPlease generate a name and description for this cluster. Be creative and original in your descriptions.\n")
    else:
        pieces.append(f"\nPlease generate a name and description for this cluster, using a creativity level of {creativity} (where 0 is sticking closely to the examples and 1 is completely original). The examples provided are: {exemplary_cluster_names_descriptions}\n")

    pieces.append("Please fill the following JSON template with the cluster name and two types of descriptions:\n")
    pieces.append("{\n \"cluster_name\": \"<generate>\",\n \"description_narrative\": \"<generate>\",\n \"description_statistical\": \"<generate>\"\n}\n")
    pieces.append(f"\nFor the narrative description, {audience_by_level[complexity]}")

    return "".join(pieces)
|
283 |
+
|
284 |
+
|
285 |
+
def generate_cluster_description(cluster_df, original_df=None, stats_list=['mean', 'min', 'max', 'std', 'kurt'], cluster_id = ""):
    """Summarise a cluster as a dict of per-column statistics.

    Args:
        cluster_df (pandas.DataFrame): Rows belonging to the cluster.
        original_df (pandas.DataFrame, optional): Full dataset. When given,
            each statistic also reports its relative difference (in percent)
            to the same statistic computed on ``original_df``.
        stats_list (list): Statistics to compute; each must be one of
            'mean', 'min', 'max', 'std', 'kurt'. The default list is only
            read, never modified.
        cluster_id: Identifier stored under 'cluster_id'.

    Returns:
        dict: Keys 'cluster_id', 'name', 'description_narrative',
        'description_statistical' (the latter three are "<generate>"
        placeholders), 'size' and 'columns'. 'columns' maps
        column -> stat -> {"value": ...} plus "relative_difference" when
        ``original_df`` is given ("N/A" if the reference value is 0).

    Raises:
        ValueError: If ``stats_list`` contains an unsupported statistic.
    """
    supported_stats = ('mean', 'min', 'max', 'std', 'kurt')
    unsupported = [stat for stat in stats_list if stat not in supported_stats]
    if unsupported:
        # Without this check an unknown name would reuse the previous
        # statistic's value (or fail on an unbound local).
        raise ValueError(f"Unsupported statistic(s): {unsupported}")

    cluster_description = {"cluster_id": cluster_id,
                           "name": "<generate>",
                           "description_narrative": "<generate>",
                           "description_statistical": "<generate>",
                           "size": len(cluster_df),
                           "columns": {}
                           }

    for column in cluster_df.columns:
        column_stats = {}
        for stat in stats_list:
            # Every supported statistic is a zero-argument Series method.
            value = round(getattr(cluster_df[column], stat)(), 2)
            entry = {"value": round(value, 2)}

            # Relative difference to the full dataset, if provided
            if original_df is not None:
                original_value = getattr(original_df[column], stat)()
                if original_value == 0:
                    # A percentage of zero is undefined.
                    entry["relative_difference"] = "N/A"
                else:
                    relative_difference = (value - original_value) / original_value * 100
                    entry["relative_difference"] = f"{round(relative_difference,2)}%"
            column_stats[stat] = entry
        cluster_description["columns"][column] = column_stats

    return cluster_description
|
319 |
+
|
320 |
+
|
321 |
+
|
322 |
+
|
323 |
+
def generate_cluster_description_mixed(cluster_df, original_df=None, stats_list=['mean', 'min', 'max', 'std', 'kurt'], cluster_id = ""):
    """Summarise a cluster as a dict whose statistics are stored as CSV text.

    Args:
        cluster_df (pandas.DataFrame): Rows belonging to the cluster.
        original_df (pandas.DataFrame, optional): Full dataset used as the
            reference for the 'Relative_Difference' CSV column.
        stats_list (list): Statistics to compute; each must be one of
            'mean', 'min', 'max', 'std', 'kurt'. The default list is only
            read, never modified.
        cluster_id: Identifier stored under 'cluster_id'.

    Returns:
        tuple: ``(cluster_description, data_description)``.
        ``cluster_description`` is a dict with the keys 'cluster_id', 'name',
        'description_narrative', 'description_statistical', 'size' and
        'columns'; 'columns' is a CSV string with the header
        Column,Stat,Value,Relative_Difference. 'Relative_Difference' is "N/A"
        when ``original_df`` is missing or the reference value is 0.
        ``data_description`` is a fixed text explaining that structure.

    Raises:
        ValueError: If ``stats_list`` contains an unsupported statistic.
    """
    supported_stats = ('mean', 'min', 'max', 'std', 'kurt')
    unsupported = [stat for stat in stats_list if stat not in supported_stats]
    if unsupported:
        # Without this check an unknown name would reuse the previous
        # statistic's value (or fail on an unbound local).
        raise ValueError(f"Unsupported statistic(s): {unsupported}")

    cluster_description = {
        "cluster_id": cluster_id,
        "name": "<generate>",
        "description_narrative": "<generate>",
        "description_statistical": "<generate>",
        "size": len(cluster_df),
        "columns": {}
    }

    # Create CSV string in memory
    csv_io = io.StringIO()
    writer = csv.writer(csv_io)

    # CSV Headers
    writer.writerow(['Column', 'Stat', 'Value', 'Relative_Difference'])

    for column in cluster_df.columns:
        for stat in stats_list:
            # Every supported statistic is a zero-argument Series method.
            value = round(getattr(cluster_df[column], stat)(), 2)

            relative_text = "N/A"
            if original_df is not None:
                original_value = getattr(original_df[column], stat)()
                # A percentage of zero is undefined, so "N/A" is kept then.
                if original_value != 0:
                    relative_difference = (value - original_value) / original_value * 100
                    relative_text = f"{round(relative_difference,2)}%"
            writer.writerow([column, stat, value, relative_text])

    # Store CSV data in JSON
    cluster_description["columns"] = csv_io.getvalue()

    data_description = (
        "\n"
        "The input data is a JSON object with details about clusters. It has the following structure:\n"
        "\n"
        "1. 'cluster_id': An identifier for the cluster.\n"
        "2. 'name': A placeholder for the name of the cluster.\n"
        "3. 'description_narrative': A placeholder for a narrative description of the cluster.\n"
        "4. 'description_statistical': A placeholder for a statistical description of the cluster.\n"
        "5. 'size': The number of elements in the cluster.\n"
        "6. 'columns': This contains statistical data about different aspects, presented in CSV format.\n"
        "\n"
        "In the 'columns' CSV:\n"
        "- 'Column' corresponds to the aspect.\n"
        "- 'Stat' corresponds to the computed statistic for that aspect in the cluster.\n"
        "- 'Value' is the value of that statistic.\n"
        "- 'Relative_Difference' is the difference of the statistic's value compared to the average value of this statistic in the entire dataset, expressed in percentages.\n"
    )

    return cluster_description, data_description
|
384 |
+
|
385 |
+
# ==================================================================================================
|
386 |
+
# ========== TESTING ===============================================================================
|
387 |
+
|
388 |
+
def compare_column_names(ref_list, check_list):
    """
    Compare two lists of column names and report inconsistencies.

    Args:
        ref_list (list): The reference list of column names.
        check_list (list): The list of column names to be checked.

    Returns:
        report_dict (dict): Keys 'missing_columns' (in ref_list only),
        'extra_columns' (in check_list only) and 'likely_typos' (extra column
        -> most similar reference column when the similarity exceeds 70).
        The same report is printed to the console.

    Raises:
        ValueError: If the input types are not list.

    Similarity is scored with ``fuzz.ratio`` when fuzzywuzzy was imported by
    the module; otherwise an equivalent 0-100 score based on
    ``difflib.SequenceMatcher`` is used.
    """
    import difflib

    # Check the type of input data
    if not all(isinstance(i, list) for i in [ref_list, check_list]):
        raise ValueError("Both inputs must be of type list")

    missing_cols = [col for col in ref_list if col not in check_list]
    extra_cols = [col for col in check_list if col not in ref_list]

    try:
        similarity = fuzz.ratio
    except NameError:
        # fuzzywuzzy is optional (its import is guarded at module level);
        # fall back to the standard library.
        def similarity(left, right):
            return int(round(100 * difflib.SequenceMatcher(None, left, right).ratio()))

    typos = {}
    # Without reference names there is nothing to match against (and max()
    # would fail on an empty mapping).
    if ref_list:
        for col in extra_cols:
            # str(): column labels are not necessarily strings
            similarity_scores = {ref_col: similarity(str(col), str(ref_col)) for ref_col in ref_list}
            likely_match = max(similarity_scores, key=similarity_scores.get)
            if similarity_scores[likely_match] > 70:  # you may adjust this threshold as needed
                typos[col] = likely_match

    report_dict = {
        "missing_columns": missing_cols,
        "extra_columns": extra_cols,
        "likely_typos": typos
    }

    print("\nREPORT:")
    print('-'*50)
    print("\n- Missing columns:")
    print(' ' + '\n '.join(f'"{col}"' for col in missing_cols) if missing_cols else ' None')
    print("\n- Extra columns:")
    print(' ' + '\n '.join(f'"{col}"' for col in extra_cols) if extra_cols else ' None')
    print("\n- Likely typos:")
    if typos:
        for k, v in typos.items():
            print(f' "{k}": "{v}"')
    else:
        print(' None')

    return report_dict
|
440 |
+
|
441 |
+
|
442 |
+
def _symmetric_pct_diff(value1, value2):
    """Return |value1 - value2| as a percentage of the magnitude of their mean (0 if the mean is 0)."""
    mean = (value1 + value2) / 2
    # abs(mean): with a signed mean, two negative statistics would always give
    # a negative percentage and never count as significant.
    return abs(value1 - value2) / abs(mean) * 100 if mean != 0 else 0


def _iqr_outlier_count(series, desc):
    """Count the values of ``series`` lying more than 1.5 * IQR outside the quartiles in ``desc``."""
    iqr = desc["75%"] - desc["25%"]
    outside = (series < desc["25%"] - 1.5 * iqr) | (series > desc["75%"] + 1.5 * iqr)
    return series[outside].count()


def compare_dataframes(df1, df2, threshold=0.1):
    """
    Compare two pandas DataFrames and report significant differences.

    Prints a column comparison, new null values, value/range consistency,
    constant columns and dtype changes for the shared columns. It then
    compares mean, std, min, 25%, 75%, max, null count and IQR-outlier count
    of every shared integer/float column.

    Args:
        df1, df2 (pandas.DataFrame): Input dataframes to be compared.
        threshold (float): The relative difference to be considered
            significant. Defaults to 0.1 (10%).

    Returns:
        pandas.DataFrame: One row per (column, statistic) whose relative
        difference exceeds the threshold, with the columns 'column',
        'statistic', 'df1', 'df2', 'diff%' and 'significant' (rounded to two
        decimals).
    """
    # Column comparison
    cols_df1 = set(df1.columns)
    cols_df2 = set(df2.columns)

    # Shared columns in df1's order so that every run reports in the same order.
    common_cols = [col for col in dict.fromkeys(df1.columns) if col in cols_df2]
    missing_df1 = cols_df2 - cols_df1
    missing_df2 = cols_df1 - cols_df2

    print("Column Comparison:")
    print("------------------")
    print(f"Common columns ({len(common_cols)}): {sorted(list(common_cols)) if common_cols else 'None'}")
    print(f"Columns missing in df1 ({len(missing_df1)}): {sorted(list(missing_df1)) if missing_df1 else 'None'}")
    print(f"Columns missing in df2 ({len(missing_df2)}): {sorted(list(missing_df2)) if missing_df2 else 'None'}")
    print("\n")

    # Check for new null values
    print("Null Values Check:")
    print("------------------")
    inconsistent_values_cols = []
    inconsistent_ranges_cols = []
    constant_cols = []
    numeric_cols = []

    for col in common_cols:
        series1 = df1[col]
        series2 = df2[col]

        if series1.isnull().sum() == 0 and series2.isnull().sum() > 0:
            print(f"New null values detected in '{col}' of df2.")

        # Value consistency: only for low-cardinality columns, and only
        # flagged when the two sets of (non-null) values actually differ.
        if series1.nunique() <= 10 and series2.nunique() <= 10:
            if set(series1.dropna().unique()) != set(series2.dropna().unique()):
                inconsistent_values_cols.append(col)

        # Range consistency for integer/float columns
        if series1.dtype.kind in 'if' and series2.dtype.kind in 'if':
            numeric_cols.append(col)
            range1 = series1.max() - series1.min()
            range2 = series2.max() - series2.min()
            mean_range = (range1 + range2) / 2
            # mean_range == 0 means both columns are constant: nothing to compare
            if mean_range != 0 and abs(range1 - range2) / mean_range * 100 > threshold * 100:
                inconsistent_ranges_cols.append(col)

        # Check for constant columns
        if len(series1.unique()) == 1 or len(series2.unique()) == 1:
            constant_cols.append(col)

    # Print out the results of value consistency, range consistency, and constant columns check
    print("\nValue Consistency Check:")
    print("------------------------")
    print(f"Columns with inconsistent values (checks if the unique values are the same in both dataframes): {inconsistent_values_cols if inconsistent_values_cols else 'None'}")

    print("\nRange Consistency Check (checks if the range (max - min) of the values in both dataframes is consistent):")
    print("------------------------")
    print(f"Columns with inconsistent ranges: {inconsistent_ranges_cols if inconsistent_ranges_cols else 'None'}")

    print("\nConstant Columns Check (columns that have constant values in either dataframe):")
    print("-----------------------")
    print(f"Constant columns: {constant_cols if constant_cols else 'None'}")

    # Check for changes in data type
    print("\nData Type Check:")
    print("----------------")
    for col in common_cols:
        dtype1 = df1[col].dtype
        dtype2 = df2[col].dtype
        if dtype1 != dtype2:
            print(f"df1 '{dtype1}' -> '{dtype2}' in df2, Data type for '{col}' has changed.")
    print("\n")

    report_dict = {"column": [], "statistic": [], "df1": [], "df2": [], "diff%": []}
    statistics = ["mean", "std", "min", "25%", "75%", "max", "nulls", "outliers"]

    def add_row(col, stat, value1, value2):
        report_dict["column"].append(col)
        report_dict["statistic"].append(stat)
        report_dict["df1"].append(value1)
        report_dict["df2"].append(value2)
        report_dict["diff%"].append(_symmetric_pct_diff(value1, value2))

    for col in numeric_cols:
        desc1 = df1[col].describe()
        desc2 = df2[col].describe()
        # The first six statistics come straight from describe()
        for stat in statistics[:-2]:
            add_row(col, stat, desc1[stat], desc2[stat])
        add_row(col, "nulls", df1[col].isnull().sum(), df2[col].isnull().sum())
        add_row(col, "outliers", _iqr_outlier_count(df1[col], desc1), _iqr_outlier_count(df2[col], desc2))

    report_df = pd.DataFrame(report_dict)
    report_df["significant"] = report_df["diff%"] > threshold * 100
    report_df = report_df[report_df["significant"]]
    report_df = report_df.round(2)

    print(f"REPORT:\n{'-'*50}")
    for col in report_df["column"].unique():
        print(f"\n{'='*50}")
        print(f"Column: {col}\n{'='*50}")
        subset = report_df[report_df["column"]==col][["statistic", "df1", "df2", "diff%"]]
        subset.index = subset["statistic"]
        print(subset.to_string(header=True))

    return report_df
|
569 |
+
|
570 |
+
|
571 |
+
def _notion_rich_text_to_str(segments):
    """Concatenate the text of a Notion rich-text array ("title" / "rich_text").

    An empty (or missing) array yields an empty string instead of raising.
    """
    parts = []
    for segment in segments or []:
        if "text" in segment:
            parts.append(segment["text"]["content"])
        else:
            # NOTE(review): non-text segments (mentions, equations) are assumed
            # to expose "plain_text" - confirm against the Notion API reference.
            parts.append(segment.get("plain_text", ""))
    return "".join(parts)


def _notion_property_value(prop):
    """Flatten a single Notion page-property object into a plain Python value.

    Parameters:
    prop (dict): One entry of ``page["properties"]``; ``prop["type"]`` names the
                 key that holds the actual payload.

    Returns:
    The payload reduced to a scalar / string, or None when the cell is empty.
    Unknown property types fall back to ``str(payload)``.
    """
    prop_type = prop["type"]
    payload = prop[prop_type]

    if prop_type in ("title", "rich_text"):
        return _notion_rich_text_to_str(payload)
    if prop_type in ("number", "checkbox", "url", "email", "phone_number"):
        return payload
    if prop_type == "select":
        # A select holds ONE option object (or null), not a list of options.
        return payload["name"] if payload else None
    if prop_type == "multi_select":
        return ', '.join(option["name"] for option in payload)
    if prop_type == "date":
        # Empty date cells are null.
        return payload["start"] if payload else None
    if prop_type in ("people", "files"):
        # Fall back to "name" for entries that carry no "id".
        return ', '.join(str(item.get("id") or item.get("name", "")) for item in payload)
    if prop_type == "formula":
        if "type" in payload:
            # The formula result lives under the key named by its own "type"
            # (string / number / boolean / date).
            return payload.get(payload["type"])
        return payload["string"] if "string" in payload else payload["number"]
    return str(payload)  # Fallback to string representation


def notion_db_as_df(database_id, token):
    """
    Download every page of a Notion database and return it as a DataFrame.

    Parameters:
    database_id (str): Id of the Notion database to query.
    token (str): Notion integration token, sent as a Bearer token.

    Returns:
    df (pd.DataFrame): One row per page, one column per property. Cells whose
                       property is absent on a page, or empty, are None.

    The query endpoint is called repeatedly, following ``next_cursor`` while the
    response reports ``has_more``, so databases larger than one result page are
    returned completely. HTTP errors are deliberately not raised: a response
    without "results" simply contributes no rows.
    """
    base_url = "https://api.notion.com/v1"

    # Headers for API requests
    headers = {
        "Authorization": f"Bearer {token}",
        "Notion-Version": "2022-06-28",
        "Content-Type": "application/json"
    }
    query_url = f"{base_url}/databases/{database_id}/query"

    pages = []
    payload = None  # first request is sent without a body, as before
    while True:
        response = requests.post(query_url, headers=headers, json=payload)
        # response.raise_for_status()  # Uncomment to raise an exception for HTTP errors
        body = response.json()
        print(body.keys())
        pages.extend(body.get('results', []))

        next_cursor = body.get("next_cursor")
        if not body.get("has_more") or not next_cursor:
            break
        payload = {"start_cursor": next_cursor}

    # Used to create df: every column is pre-filled with None so that pages
    # lacking a property still line up row-wise.
    table_data = {}
    page_cnt = len(pages)
    for i, page in enumerate(pages):
        for cur_col, val in page["properties"].items():
            if cur_col not in table_data:
                table_data[cur_col] = [None] * page_cnt
            table_data[cur_col][i] = _notion_property_value(val)

    # To DataFrame
    df = pd.DataFrame(table_data)
    return df
|
speckleUtils/plots_utils.py
ADDED
@@ -0,0 +1,814 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import pandas as pd
|
3 |
+
import seaborn as sns
|
4 |
+
import matplotlib.pyplot as plt
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
import math
|
8 |
+
|
9 |
+
import matplotlib.pyplot as plt
|
10 |
+
import matplotlib.patches as patches
|
11 |
+
import matplotlib.colors as colors
|
12 |
+
from matplotlib.colors import ListedColormap, LinearSegmentedColormap, Normalize
|
13 |
+
from matplotlib.cm import ScalarMappable
|
14 |
+
import pandas as pd
|
15 |
+
import numpy as np
|
16 |
+
from pandas.api.types import is_numeric_dtype
|
17 |
+
from mpl_toolkits.axes_grid1 import make_axes_locatable
|
18 |
+
|
19 |
+
from sklearn.metrics import r2_score
|
20 |
+
|
21 |
+
def _to_numeric_where_possible(column):
    """Convert a column to numbers, keeping the original value wherever conversion fails."""
    numeric = pd.to_numeric(column, errors='coerce')
    # Positions that were real values before but could not be parsed as numbers.
    failed = (numeric.isna() & column.notna()).to_numpy()
    if not failed.any():
        return numeric
    merged = numeric.to_numpy(dtype=object, copy=True)
    merged[failed] = column.to_numpy(dtype=object)[failed]
    return pd.Series(merged, index=column.index, name=column.name)


def cleanData(data, mode="drop", num_only=False):
    """
    This function cleans the input data based on the specified mode.

    Parameters:
    data (pd.DataFrame, pd.Series, or np.ndarray): The input data to be cleaned.
    mode (str, optional): The cleaning method, one of "drop", "replace_zero", or "replace_mean".
                          "drop" removes NaN values,
                          "replace_zero" replaces NaN values with zeros,
                          "replace_mean" replaces NaN values with the mean of the data.
                          Any other value leaves NaN values untouched.
                          Defaults to "drop".
    num_only (bool, optional): If True and data is a DataFrame, only int64 and float64 columns are kept.
                               If False, every DataFrame column is converted to numbers where possible;
                               values that cannot be converted are kept as they are.
                               Defaults to False.

    Returns:
    data (same type as input): The cleaned data. The input object itself is never modified.

    Raises:
    ValueError: If data is not a DataFrame, Series or ndarray.

    For DataFrames, "replace_mean" fills each numeric column with its own mean;
    non-numeric columns are left as they are.
    """
    # check the type of input data
    if isinstance(data, pd.DataFrame):
        if num_only:
            data = data.select_dtypes(include=['int64', 'float64'])
        else:
            # Work on a copy so the caller's frame is not rewritten, and assign
            # whole columns instead of relying on chained in-place fillna.
            data = data.copy()
            for col in data.columns:
                data[col] = _to_numeric_where_possible(data[col])

        if mode == "drop":
            data = data.dropna()
        elif mode == "replace_zero":
            data = data.fillna(0)
        elif mode == "replace_mean":
            # numeric_only avoids a TypeError when text columns are present
            data = data.fillna(data.mean(numeric_only=True))

    elif isinstance(data, pd.Series):
        if mode == "drop":
            data = data.dropna()
        elif mode == "replace_zero":
            data = data.fillna(0)
        elif mode == "replace_mean":
            data = data.fillna(data.mean())

    elif isinstance(data, np.ndarray):
        if mode == "drop":
            # boolean masking flattens multi-dimensional arrays
            data = data[~np.isnan(data)]
        elif mode == "replace_zero":
            data = np.nan_to_num(data, nan=0)
        elif mode == "replace_mean":
            data = np.where(np.isnan(data), np.nanmean(data), data)

    else:
        raise ValueError("Unsupported data type")

    return data
|
80 |
+
|
81 |
+
def boxPlot(inp_data, columName, cull_invalid=True):
    """
    Draw a single horizontal boxplot of ``inp_data`` with the mean highlighted.

    Parameters:
    inp_data: Data to plot; passed through cleanData when cull_invalid is True.
    columName (str): Text shown as the (faint, left-aligned) title of the plot.
    cull_invalid (bool, optional): If True, NaN values are dropped first. Defaults to True.

    Returns:
    fig (matplotlib Figure object): Figure containing the boxplot.
    ax (matplotlib Axes object): Axes of the created boxplot.

    The mean is marked with a thick vertical line and a ' mean' label. Spines,
    x ticks and the x label are removed.
    """
    if cull_invalid == True:
        inp_data = cleanData(inp_data, mode="drop", num_only=True)

    # Figure first, then the seaborn style (same order as always).
    fig, ax = plt.subplots(figsize=(10, 3), dpi=200)
    sns.set_style("white")

    # Summary statistics; only the mean is drawn, the others are currently unused.
    lowest, highest = np.min(inp_data), np.max(inp_data)
    q1, q3 = np.percentile(inp_data, 25), np.percentile(inp_data, 75)
    average = np.mean(inp_data)

    # Marker and label for the mean
    ax.vlines([average], ymin=-0.35, ymax=0.35, colors='black', linewidth=3)
    ax.text(average, -0.35, ' mean', ha='left', va='top', fontsize=14)

    # The boxplot itself, with all element styles given inline
    sns.boxplot(
        x=inp_data,
        boxprops={'edgecolor': 'black', 'linewidth': 2, 'facecolor': 'white', 'alpha': 0.5},
        medianprops={'color': 'gray', 'linewidth': 0},
        whiskerprops={'color': 'black', 'linewidth': 1},
        capprops={'color': 'black', 'linewidth': 2},
        flierprops={'marker': 'o', 'markersize': 3, 'color': 'white', 'markerfacecolor': 'lightgray'},
        meanprops={'color': 'black', 'linewidth': 1.0},
        width=0.3,
        ax=ax,
        meanline=True,
        showmeans=True,
    )

    # Title in the top left corner
    ax.set_title(columName, loc='left', color="lightgrey", alpha=0.2)

    # No frame around the plot
    for frame_line in ax.spines.values():
        frame_line.set_visible(False)

    # No ticks, tick labels or axis label on x
    ax.set_xticks([])
    ax.set_xticklabels([])
    ax.set_xlabel('')

    return fig, ax
|
159 |
+
|
160 |
+
|
161 |
+
|
162 |
+
def boxPlot_colorbar(inp_data, columName, cull_invalid=True, color = ['blue', 'red']):
    """
    This function creates a boxplot with an integrated colorbar for a given set of data.

    Parameters:
    inp_data: Input data for which the boxplot is to be created; passed through
              cleanData when cull_invalid is true.
    columName (str): The name of the column which the data represents, to be used as title for the boxplot.
    cull_invalid (bool, optional): If True, invalid entries in the data are dropped. Defaults to True.
    color (list of str, optional): List of colors to use for the gradient colorbar. Defaults to ['blue', 'red'].
                                   The list is only read, never modified.

    Returns:
    fig (matplotlib Figure object): Figure containing the colorbar and the boxplot.
    ax (matplotlib Axes object): Axes of the created boxplot.

    The colorbar above the boxplot runs through the gradient between the minimum
    and the 75th percentile of the data and stays at the last color from there
    up to the maximum. The 75th percentile is marked on both axes.
    """
    if cull_invalid:
        inp_data = cleanData(inp_data, mode="drop", num_only=True)

    # Create a new figure: a thin colorbar row on top of the boxplot row
    fig, (cax, ax) = plt.subplots(nrows=2, figsize=(10,3), dpi=75,
                                  gridspec_kw={'height_ratios': [0.1, 1], 'hspace': 0.02})

    # Set the style to white background
    sns.set_style("white")

    min_val = np.min(inp_data)
    max_val = np.max(inp_data)
    Q3 = np.percentile(inp_data, 75)

    # Share (in percent) of the value range that lies below Q3. With constant
    # data the range is zero; use the full gradient instead of dividing 0 by 0.
    span = max_val - min_val
    if span > 0:
        ratio = int(np.ceil((Q3 - min_val) / span * 100))
        ratio = min(max(ratio, 0), 100)
    else:
        ratio = 100

    # Create a custom colormap: gradient up to Q3, flat end color afterwards
    cmap1 = LinearSegmentedColormap.from_list("mycmap", color)
    rgba_steps = np.concatenate((cmap1(np.linspace(0, 1, ratio)),
                                 np.repeat([cmap1(1.)], 100 - ratio, axis=0)))
    cmap2 = ListedColormap(rgba_steps)

    norm = Normalize(vmin=min_val, vmax=max_val)
    sm = ScalarMappable(norm=norm, cmap=cmap2)

    # Draw a vertical line at Q3
    cax.axvline(Q3*0.97, color='k', linewidth=3)
    fig.colorbar(sm, cax=cax, orientation='horizontal', ticks=[])

    # Mark Q3 on the boxplot axes as well
    ax.vlines([Q3], ymin=-0.35, ymax=0.35, colors='black', linewidth=3)
    ax.text(Q3, 0.83, ' 75th percentile', ha='left', va='top', transform=ax.get_xaxis_transform(), fontsize=14)

    # Define the properties for the boxplot elements
    boxprops = {'edgecolor': 'black', 'linewidth': 2, 'facecolor': 'white', 'alpha':0.5}
    medianprops = {'color': 'gray', 'linewidth': 0}
    whiskerprops = {'color': 'black', 'linewidth': 1}
    capprops = {'color': 'black', 'linewidth': 2}
    flierprops = {'marker':'o', 'markersize':3, 'color':'white', 'markerfacecolor':'lightgray'}
    meanprops = {'color': 'black', 'linewidth': 1.0}

    # Create the boxplot
    sns.boxplot(x=inp_data,
                boxprops=boxprops,
                medianprops=medianprops,
                whiskerprops=whiskerprops,
                capprops=capprops,
                flierprops=flierprops,
                meanprops=meanprops,
                width=0.3,
                ax=ax,
                meanline=True,
                showmeans=True,
                )

    # Set the figure title and place it on the top left corner
    ax.set_title(columName, loc='left', color="lightgrey", alpha=0.2)

    # Remove the black outline from the figure
    for spine in ax.spines.values():
        spine.set_visible(False)

    # No x ticks or tick labels
    ax.set_xticks([])
    ax.set_xticklabels([])

    # Remove the x-axis label
    ax.set_xlabel('')

    return fig, ax
|
258 |
+
|
259 |
+
|
260 |
+
|
261 |
+
|
262 |
+
|
263 |
+
|
264 |
+
def histogramScore(inp_data,columName, cull_invalid=True):
    """
    Draw a histogram of ``inp_data`` with the mean highlighted.

    Parameters:
    inp_data: Data to plot; passed through cleanData when cull_invalid is true.
    columName (str): Text shown as the (faint, left-aligned) title of the plot.
    cull_invalid (bool, optional): If true, NaN values are dropped first. Defaults to True.

    Returns:
    fig (matplotlib Figure object): Figure containing the histogram.
    ax (matplotlib Axes object): Axes of the created histogram.

    The mean is marked with a dotted vertical line, a "Mean" tag and a
    "<mean> from <max>" label. X ticks sit at the minimum, halfway between
    minimum and mean, the mean and the maximum (rounded to one decimal).
    """
    if cull_invalid:
        inp_data = cleanData(inp_data, mode="drop", num_only=True)

    fig, ax = plt.subplots()

    # White seaborn style
    sns.set_style("white")

    # Histogram with an automatically chosen number of bins
    ax.hist(inp_data, edgecolor='black', facecolor=(0.99,0.99,0.99,1), bins='auto')

    # Only the left and bottom axis lines stay visible, in light grey
    for side, axis_line in ax.spines.items():
        if side in ('left', 'bottom'):
            axis_line.set_visible(True)
            axis_line.set_color("lightgrey")
        else:
            axis_line.set_visible(False)

    # Summary statistics (the quartiles are currently unused)
    lowest, highest = np.min(inp_data), np.max(inp_data)
    q1, q3 = np.percentile(inp_data, 25), np.percentile(inp_data, 75)
    average = np.mean(inp_data)

    # Two equal steps lead from the mean down to the minimum
    half_gap = (average - lowest) / 2
    tick_positions = [round(pos, 1)
                      for pos in (average - 2*half_gap, average - half_gap, average, highest)]
    ax.set_xticks(tick_positions)

    # Dotted line at the mean value
    ax.axvline(x=average, ymax=0.85, linestyle='dotted', color='black')

    # Text tags on white boxes near the top of the line
    white_box = dict(facecolor='white', edgecolor='white', boxstyle='round,pad=0.2')
    ax.text(average, ax.get_ylim()[1] * 0.98, "Mean", weight="bold", size=22, ha="center",
            bbox=dict(white_box))
    mean_vs_max = " from ".join([str(round(average, 1)), str(round(highest, 1))])
    ax.text(average, ax.get_ylim()[1] * 0.85, mean_vs_max, ha='center', va='bottom', size=22,
            bbox=dict(white_box))

    # Title in the top left corner
    ax.set_title(columName, loc='left', color="lightgrey", alpha=0.3)

    # Smaller y tick labels, no x label
    ax.tick_params(axis='y', labelsize=8)
    ax.set_xlabel('')

    return fig, ax
|
323 |
+
|
324 |
+
|
325 |
+
# =============================================================================
|
326 |
+
#==============================================================================
|
327 |
+
|
328 |
+
|
329 |
+
def get_drawing_order(dataset, order_of_importance, sorting_direction):
    """
    Return the index labels of ``dataset`` sorted by the parts of its 'ids' column.

    Parameters:
    dataset (pd.DataFrame): Frame with an 'ids' column of the form "a;b;c"
                            (three integers separated by semicolons).
    order_of_importance (list): Which id part (1, 2 or 3) is the first, second, ... sort key.
    sorting_direction (list of str): '+' sorts the matching key ascending,
                                     anything else descending.

    Returns:
    list: Index labels of the dataset in drawing order. The dataset itself is
          left unchanged.
    """
    # for activity nodes
    scratch = dataset.copy()
    id_parts = scratch['ids'].str.split(';', expand=True).astype(int)
    scratch[['id1', 'id2', 'id3']] = id_parts

    sort_keys = ['id{}'.format(part) for part in order_of_importance]
    ascending_flags = []
    for sign in sorting_direction:
        ascending_flags.append(sign == '+')

    ordered = scratch.sort_values(sort_keys, ascending=ascending_flags)
    return ordered.index.tolist()
|
337 |
+
|
338 |
+
|
339 |
+
def calculate_aspect_ratio(all_x_coords, all_y_coords):
    """
    Compute a figure size (width, height) that matches the extent of the coordinates.

    Parameters:
    all_x_coords (sequence of numbers): All x coordinates that will be drawn.
    all_y_coords (sequence of numbers): All y coordinates that will be drawn.

    Returns:
    tuple: (width, height). The shorter side is 15 and the longer side is scaled
           so that height / width equals y_range / x_range. If either range is
           zero (a single point or an axis-parallel line) a square (15, 15) is returned.

    Raises:
    ValueError: If either sequence is empty (raised by min/max).
    """
    size = 15
    x_range = max(all_x_coords) - min(all_x_coords)
    y_range = max(all_y_coords) - min(all_y_coords)

    # Degenerate extent: no meaningful ratio, and dividing would fail.
    if x_range == 0 or y_range == 0:
        return (size, size)

    aspect_ratio = y_range / x_range
    if aspect_ratio > 1:
        # Taller than wide: keep the width and stretch the height.
        return (size, size * aspect_ratio)
    # Wider than tall (or square): keep the height and stretch the width.
    return (size / aspect_ratio, size)
|
345 |
+
|
346 |
+
|
347 |
+
def create_colorbar(fig, ax, dataset, coloring_col, cmap, title="", cb_positioning=[0.9, 0.4, 0.02, 0.38],
                    tick_unit="", normalize_override=("min", "max")):
    """
    Add a colorbar for ``dataset[coloring_col]`` to the figure.

    Parameters:
    fig (matplotlib Figure object): Figure that receives the colorbar axes.
    ax (matplotlib Axes object): Main plot axes; an extra axes is appended to its right.
    dataset (pd.DataFrame): Data holding the coloring column.
    coloring_col (str): Column whose minimum / maximum define the color range.
    cmap: Colormap handed to the ScalarMappable.
    title (str, optional): Text annotated above the colorbar.
    cb_positioning (list, optional): Rectangle passed to fig.add_axes for the colorbar.
    tick_unit (str, optional): Unit appended to the two tick labels.
    normalize_override (tuple, optional): (lower, upper) bounds of the color range;
        the strings "min" / "max" mean "take it from the data".

    Returns:
    sm (ScalarMappable): Mapping from data values to colors.
    colorbar: The created colorbar.
    """
    # Extra axes to the right of the main axes; it is switched off again at the end.
    make_axes_locatable(ax).append_axes("right", size="2%", pad=5.55)

    # Determine normalization values
    lower_bound = normalize_override[0]
    upper_bound = normalize_override[1]
    vmin = dataset[coloring_col].min() if lower_bound == "min" else lower_bound
    vmax = dataset[coloring_col].max() if upper_bound == "max" else upper_bound

    value_norm = plt.Normalize(vmin=vmin, vmax=vmax)
    sm = plt.cm.ScalarMappable(cmap=cmap, norm=value_norm)

    colorbar_ax = fig.add_axes(cb_positioning)
    colorbar = fig.colorbar(sm, cax=colorbar_ax)

    # Two ticks slightly inside the ends, labelled with the actual end values
    colorbar.set_ticks([vmin*1.05, vmax*0.95])
    end_labels = [str(round(bound, 1)) + " " + tick_unit for bound in (vmin, vmax)]
    colorbar.ax.set_yticklabels(end_labels)
    colorbar.ax.tick_params(labelsize=44)

    # Title above the colorbar
    colorbar.ax.annotate(title, xy=(0.55, 1.1), xycoords='axes fraction', fontsize=44,
                         xytext=(-45, 15), textcoords='offset points',
                         ha='left', va='bottom')

    # Hide every axes except the main plot and the colorbar
    for other_axes in fig.axes:
        if other_axes is ax or other_axes is colorbar_ax:
            continue
        other_axes.axis('off')

    return sm, colorbar
|
388 |
+
|
389 |
+
|
390 |
+
|
391 |
+
def _polygon_coords_from_cell(cell):
    """Return the coordinates stored in a dataset cell as a NEW list, or None if unusable.

    Accepts a bracketed, comma separated string such as "[1.0, 2.0]" or a list.
    """
    if isinstance(cell, str):
        if len(cell) <= 2:
            return None
        return [float(part) for part in cell[1:-1].split(",")]
    if isinstance(cell, list):
        # Copy, so closing the polygon below never edits the caller's data.
        return list(cell)
    return None


def draw_polygons(ax, dataset, x_cord_name, y_cord_name, style_dict, sm=None, drawing_order=None, cmap=None, coloring_col=None):
    """
    This function draws polygons on a given axes object based on coordinates defined in the dataset.

    Parameters:
    ax (matplotlib.axes.Axes): The axes object on which to draw the polygons.
    dataset (pd.DataFrame): The input DataFrame containing the coordinates of the polygons.
    x_cord_name (str): The name of the column in the dataset that contains the x-coordinates.
    y_cord_name (str): The name of the column in the dataset that contains the y-coordinates.
    style_dict (dict): A dictionary defining the style parameters for the polygons.
    sm (matplotlib.cm.ScalarMappable, optional): The scalar mappable object used for mapping normalized data to RGBA.
    drawing_order (list, optional): A list of indices defining the order in which to draw the polygons.
    cmap (matplotlib.colors.Colormap, optional): The colormap to use for coloring the polygons.
                                                 If omitted while sm is given, sm's own colormap is used.
    coloring_col (str, optional): The name of the column in the dataset that contains the coloring values for the polygons.

    Returns:
    None

    Each coordinate cell may be a list or a string like "[x1, x2, ...]". Rows whose
    x or y cell is missing, empty or of another type are skipped. If sm is given,
    the face color is taken from the row's value in coloring_col and overrides any
    'facecolor' in style_dict. Drawing is best effort: a row that cannot be turned
    into a polygon is skipped (and reported at DEBUG log level) instead of aborting
    the whole plot. The dataset is not modified.
    """
    import logging
    logger = logging.getLogger(__name__)

    if drawing_order is None:
        drawing_order = dataset.index

    for idx in drawing_order:
        row = dataset.loc[idx]

        # Parsed freshly for every row, so a row without usable coordinates can
        # never reuse the polygon of the previous row.
        patch_x_list = _polygon_coords_from_cell(row[x_cord_name])
        patch_y_list = _polygon_coords_from_cell(row[y_cord_name])
        if not patch_x_list or not patch_y_list:
            continue

        try:
            # Close the ring if the last point differs from the first in x OR y.
            if patch_x_list[0] != patch_x_list[-1] or patch_y_list[0] != patch_y_list[-1]:
                patch_x_list.append(patch_x_list[0])
                patch_y_list.append(patch_y_list[0])

            style = dict(style_dict)
            if sm is not None:
                color_map = cmap if cmap is not None else sm.cmap
                # Set the key instead of passing a second facecolor keyword.
                style['facecolor'] = color_map(sm.norm(row[coloring_col]))

            polygon = patches.Polygon(np.column_stack((patch_x_list, patch_y_list)), **style)
            ax.add_patch(polygon)
        except Exception as e:
            # Deliberate best effort: keep drawing the remaining rows.
            logger.debug("Skipping polygon for row %r: %s", idx, e)
|
448 |
+
|
449 |
+
|
450 |
+
def configure_plot(ax, all_x_coords, all_y_coords, buffer=0.03):
    """
    Frame the axes around the given coordinates and strip all decoration.

    Parameters:
    ax (matplotlib Axes object): Axes to configure.
    all_x_coords (sequence of numbers): All x coordinates shown in the plot.
    all_y_coords (sequence of numbers): All y coordinates shown in the plot.
    buffer (float, optional): Margin on each side as a fraction of the
                              coordinate range. Defaults to 0.03.

    Returns:
    None

    Sets an equal aspect ratio, sets the limits to the coordinate extent plus the
    margin, and removes ticks and spines.
    """
    x_low, x_high = min(all_x_coords), max(all_x_coords)
    y_low, y_high = min(all_y_coords), max(all_y_coords)
    x_margin = buffer * (x_high - x_low)
    y_margin = buffer * (y_high - y_low)

    ax.set_aspect('equal')
    ax.set_xlim([x_low - x_margin, x_high + x_margin])
    ax.set_ylim([y_low - y_margin, y_high + y_margin])

    # No ticks, no frame
    ax.set_xticks([])
    ax.set_yticks([])
    for frame_line in ax.spines.values():
        frame_line.set_visible(False)
|
461 |
+
|
462 |
+
|
463 |
+
# Main script
|
464 |
+
#dataset = dataset.dropna()
|
465 |
+
|
466 |
+
# column used for heatmap and colorbar
|
467 |
+
def _activity_node_coords(cell):
    """Return the coordinates stored in a patches_* cell as a list, or an empty list if unusable."""
    if isinstance(cell, str) and len(cell) > 2:
        return [float(part) for part in cell[1:-1].split(",")]
    if isinstance(cell, list):
        return cell
    return []


def createActivityNodePlot(dataset,
                           colorbar_title="",
                           color="coolwarm",
                           data_col=None,
                           cb_positioning = [0.9, 0.4, 0.02, 0.38],
                           draw_oder_instruction=['-', '-', '+'],
                           tick_unit="",
                           normalize_override=("min", "max")):

    """
    This function creates an activity node plot using the provided dataset, and optionally includes a colorbar.

    Parameters:
    dataset (pd.DataFrame): The input DataFrame containing the data. It must provide the columns
                            "patches_x_AN", "patches_y_AN", "patches_x_Bridges", "patches_y_Bridges"
                            and "ids".
    colorbar_title (str, optional): The title for the colorbar. Default is an empty string.
    color (str or list, optional): The colormap for the plot. Can be a matplotlib colormap name or a list of colors. Default is "coolwarm".
    data_col (str, optional): The name of the column in the dataset to use for coloring the nodes. If not provided, the first column of the dataset is used.
    cb_positioning (list, optional): A list of four floats defining the position and size of the colorbar. Defaults to [0.9, 0.4, 0.02, 0.38].
    draw_oder_instruction (list, optional): A list of strings defining the order in which to draw the polygons. Defaults to ['-', '-', '+'].
    tick_unit (str, optional): The unit for the ticks on the colorbar. Default is an empty string.
    normalize_override (tuple, optional): Lower and upper bound of the color range; "min" / "max" take the bound from the data.

    Returns:
    fig (matplotlib.figure.Figure): The created figure object.
    ax (matplotlib.axes._subplots.AxesSubplot): The created Axes object.

    Raises:
    ValueError: If no row of the dataset contains usable activity node coordinates.

    Activity nodes are colored by the coloring column when that column is numeric
    (a colorbar is added in that case); otherwise they are drawn without data
    coloring. Bridges are always drawn in black. The list defaults are only read,
    never modified.
    """

    coloring_col = dataset.columns[0] if data_col is None else data_col

    # Collect every node coordinate to size and frame the figure. Rows without
    # usable coordinates contribute nothing (instead of repeating the previous row).
    all_x_coords = []
    all_y_coords = []
    for _, row in dataset.iterrows():
        patch_x_list = _activity_node_coords(row["patches_x_AN"])
        patch_y_list = _activity_node_coords(row["patches_y_AN"])
        if not patch_x_list or not patch_y_list:
            continue
        all_x_coords.extend(patch_x_list)
        all_y_coords.extend(patch_y_list)

    if not all_x_coords or not all_y_coords:
        raise ValueError("dataset contains no usable activity node coordinates")

    figsize = calculate_aspect_ratio(all_x_coords, all_y_coords)
    fig, ax = plt.subplots(figsize=figsize)

    # color map
    if isinstance(color, (list, tuple)):
        cmap = LinearSegmentedColormap.from_list('custom_color', color)
    else:
        # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
        # available in recent Matplotlib releases.
        cmap = plt.get_cmap(color)

    # Activity Node geometry
    style_dict_an = {'linewidth': 1, 'edgecolor': "Black"}

    # Without numeric data there is nothing to map to colors: no colorbar and
    # sm stays None so the nodes are drawn with the plain style.
    sm = None
    if is_numeric_dtype(dataset[coloring_col]):
        sm, _ = create_colorbar(fig, ax, dataset, coloring_col, cmap, colorbar_title,
                                cb_positioning=cb_positioning, tick_unit=tick_unit,
                                normalize_override=normalize_override)

    drawing_order = get_drawing_order(dataset, [1, 3, 2], draw_oder_instruction)

    draw_polygons(ax,
                  dataset,
                  "patches_x_AN",
                  "patches_y_AN",
                  style_dict_an,
                  sm,
                  drawing_order,
                  cmap,
                  coloring_col)

    # Bridges are plain black shapes: no scalar mappable / colormap is passed, so
    # their fixed facecolor is used.
    style_dict_bridges = {'linewidth': 1, 'edgecolor': "Black", 'facecolor': "Black"}

    draw_polygons(ax,
                  dataset,
                  "patches_x_Bridges",
                  "patches_y_Bridges",
                  style_dict_bridges,
                  )

    configure_plot(ax, all_x_coords, all_y_coords)
    return fig, ax
|
566 |
+
|
567 |
+
|
568 |
+
|
569 |
+
|
570 |
+
def radar(values_norm,
          labels,
          color,
          cluster_name,
          factor=100,
          ax_multi=None,
          fig_multi=None,
          label_font_size=6,
          num_datapoints=None):
    """
    Draw a radar (spider) chart from normalized values and their labels.

    Parameters:
    values_norm (list of numbers): Normalized values, one per label. They are
        multiplied by `factor` before plotting. The input list is not modified.
    labels (list of str): Axis labels. The input list is not modified.
    color (str): Color used for both the outline and the translucent fill.
    cluster_name (str): Title of the chart.
    factor (int, optional): Scaling factor applied to `values_norm`, defaults to 100.
        Note that the radial axis is always fixed to the range 0-100.
    ax_multi (matplotlib Axes, optional): Existing polar Axes to draw on.
    fig_multi (matplotlib Figure, optional): Figure owning `ax_multi`.
        A new figure/axes pair is created unless BOTH `ax_multi` and
        `fig_multi` are given.
    label_font_size (int, optional): Font size of the axis labels, defaults to 6.
    num_datapoints (int, optional): If given, "datapoints: <n>" is written
        centered below the figure.

    Returns:
    fig (matplotlib Figure): Figure containing the radar chart.
    ax (matplotlib Axes): Polar axes of the radar chart.
    """
    if ax_multi is None or fig_multi is None:
        fig, ax = plt.subplots(figsize=(3.5, 3.5), subplot_kw=dict(polar=True), dpi=200)
    else:
        fig, ax = fig_multi, ax_multi

    # Work on private copies so the caller's lists are never modified
    # (the closing point appended below must not leak back to the caller).
    scaled_values = [v * factor for v in values_norm]
    tick_labels = list(labels)

    # Number of variables we're plotting.
    num_vars = len(tick_labels)

    # Split the circle into even parts and save the angles
    # so we know where to put each axis.
    angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()

    # The plot is a circle, so we need to "complete the loop"
    # and append the start value to the end.
    scaled_values += scaled_values[:1]
    angles += angles[:1]
    tick_labels += tick_labels[:1]

    # Draw the outline of our data.
    ax.plot(angles, scaled_values, color=color, linewidth=2)

    # Fill it in.
    ax.fill(angles, scaled_values, color=color, alpha=0.15)

    # Fix axis to go in the right order and start at 12 o'clock.
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # Draw axis lines for each angle and label.
    ax.set_thetagrids(np.degrees(angles), tick_labels)

    # Go through labels and adjust alignment based on where
    # it is in the circle.
    for label, angle in zip(ax.get_xticklabels(), angles):
        if angle in (0, np.pi):
            label.set_horizontalalignment('center')
        elif 0 < angle < np.pi:
            label.set_horizontalalignment('left')
        else:
            label.set_horizontalalignment('right')
        label.set_fontsize(label_font_size)

    # Ensure radar goes from 0 to 100.
    ax.set_ylim(0, 100)

    # Put the radial (0-100) tick labels in the middle of the first two axes.
    ax.set_rlabel_position(180 / num_vars)

    # Add some custom styling.
    # Change the color of the tick labels.
    ax.tick_params(colors='#222222')
    # Make the y-axis (0-100) labels smaller.
    ax.tick_params(axis='y', labelsize=6)
    # Change the color of the circular gridlines.
    ax.grid(color='#AAAAAA')
    # Change the color of the outermost gridline (the spine).
    ax.spines['polar'].set_color('#222222')
    # Change the background color inside the circle itself.
    ax.set_facecolor('#FAFAFA')

    # Give the chart a title with some padding above the topmost axis label.
    ax.set_title(cluster_name, y=1.11)

    if num_datapoints is not None:
        # Figure-level text (figure coordinates), written on the figure we
        # are actually drawing on rather than on whatever figure is current.
        fig.text(0.5, -0.05, f'datapoints: {num_datapoints}', ha='center')

    return fig, ax
|
681 |
+
|
682 |
+
|
683 |
+
def gh_color_blueRed():
    """Return the Grasshopper blue-to-red color scheme as RGB triples in [0, 1]."""
    rgb_255 = (
        (15, 16, 115),
        (177, 198, 242),
        (251, 244, 121),
        (222, 140, 61),
        (183, 60, 34),
    )
    # Scale every 0-255 channel down to the [0, 1] range.
    scaled = []
    for red, green, blue in rgb_255:
        scaled.append([red / 255., green / 255., blue / 255.])
    return scaled
|
693 |
+
|
694 |
+
|
695 |
+
def linear_regression_with_residuals(
        df, x_name, y_name, buffer=5, data_range_max=None, max_residual_color=None, rescale_range=None, generateName=False
):
    """
    Generate a scatter plot with linear regression, residuals, and a color-coded line of equality.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    x_name (str): Column name of the x-axis variable ("+" is shown as a space in the axis label).
    y_name (str): Column name of the y-axis variable ("+" is shown as a space in the axis label).
    buffer (int, optional): Buffer as a percentage of data range for plot margins. Default is 5.
    data_range_max (float, optional): Maximum value for x and y axes; the minimum is then 0.
        Default is None (range taken from the data).
    max_residual_color (float, optional): Maximum absolute residual used for color normalization.
        Default is None (largest absolute residual in the data).
    rescale_range (tuple, optional): (min, max) range both x and y are min-max rescaled to.
        Default is None (no rescaling).
    generateName (bool, optional): If True, also return a generated plot name. Default is False.

    Returns:
    The `matplotlib.pyplot` module with the plot drawn on a newly created current figure,
    or a tuple `(plt, plt_name)` when `generateName` is True.

    Side effects:
    Prints the R2 score and sets the global rcParam 'font.family' to 'DejaVu Sans'.
    """

    def _axis_label(column_name):
        # Show "+"-separated column names as words and upper-case the first
        # character only; slicing keeps an empty name from raising.
        text = column_name.replace("+", " ")
        return text[:1].capitalize() + text[1:]

    # Extract x and y values from the DataFrame
    x = df[x_name].values
    y = df[y_name].values

    # Rescale x and y if rescale_range is provided
    if rescale_range:
        target_min, target_max = rescale_range
        target_span = target_max - target_min
        x_low, x_high = min(x), max(x)
        y_low, y_high = min(y), max(y)
        x = (x - x_low) / (x_high - x_low) * target_span + target_min
        y = (y - y_low) / (y_high - y_low) * target_span + target_min

    # Calculate R2 score
    r2 = r2_score(x, y)
    print(f"R2 Score: {r2}")

    # Calculate residuals in relation to the 45-degree line
    residuals_45 = y - x.flatten()

    # Calculate the data range with a buffer
    if data_range_max:
        data_min = 0
        data_max = data_range_max
    else:
        data_min = min(min(x), min(y))
        data_max = max(max(x), max(y))
    data_span = data_max - data_min
    buffer_value = data_span * (buffer / 100)
    axis_low = data_min - buffer_value
    axis_high = data_max + buffer_value

    # Create a square plot with the same range for both axes
    plt.figure()
    colormap = 'bwr'  # Choose a colormap
    cmap = plt.get_cmap(colormap)
    plt.rcParams['font.family'] = 'DejaVu Sans'

    # Center the colormap on zero by using a symmetric normalization range
    if max_residual_color is None:
        max_residual_color = max(abs(residuals_45))
    norm = plt.Normalize(-max_residual_color, max_residual_color)

    colors = np.array(cmap(norm(residuals_45)), dtype=object)

    # Edge color is the fill color with every RGBA channel scaled by 0.9
    edge_colors = [tuple(0.9 * np.array(c)) for c in colors]

    # Scatter points with a slightly darker contour
    scatter = plt.scatter(x, y, c=colors, label='True values', edgecolors=edge_colors, linewidths=2, zorder=3)

    # Plot the line of equality (x == y)
    combined_line = plt.plot([axis_low, axis_high], [axis_low, axis_high],
                             color='black', linewidth=1, zorder=5)

    # Draw each residual as a vertical dashed segment from the point to the line of equality
    for x_val, y_val in zip(x, y):
        plt.plot([x_val, x_val], [y_val, x_val], color='gray', linestyle='--', linewidth=0.5, zorder=1)

    # Plot the linear regression line
    m, b = np.polyfit(x, y, 1)
    regression_line = plt.plot(x, m * x + b, color='grey', linestyle='dotted', linewidth=1, label='Linear Regression line', zorder=4)

    # Calculate the R2 score text position
    text_x = data_min + 0.01 * data_span
    text_y = data_max - 0.01 * data_span

    # Annotate the plot with the R2 score
    plt.text(text_x, text_y, f'$R^2$ Score: {r2:.2f}', fontsize=8, color='black')

    # Add colorbar for residuals (smaller and within the plot)
    sm = plt.cm.ScalarMappable(cmap=colormap, norm=norm)
    sm.set_array([])
    cbar = plt.colorbar(sm, ax=plt.gca(), shrink=0.2, aspect=15, pad=0.03)
    cbar.set_label('Residuals (line of Equality)', fontsize=8)

    # Create separate legend handles and labels
    legend_handles = [scatter, regression_line[0], combined_line[0]]
    legend_labels = ['True values', 'Linear Regression line', 'Line of Equality']

    # Create the combined legend
    combined_legend = plt.legend(handles=legend_handles, labels=legend_labels, loc='lower right', fontsize=8)

    # Set the same limits for both x and y axes with a buffer
    plt.xlim(axis_low, axis_high)
    plt.ylim(axis_low, axis_high)

    plt.gca().add_artist(combined_legend)  # Add the combined legend to the plot

    plt.title('Linear Regression Visualization with Residuals (line of Equality)')
    plt.xlabel(_axis_label(x_name))
    plt.ylabel(_axis_label(y_name))

    # Add very light grey background grid lines
    plt.grid(True, color='lightgrey', linestyle='--', alpha=0.6, zorder=0)

    if generateName:
        # Plot name
        plt_name = "linearRegr_" + "".join(word.capitalize() for word in x_name.split("+")) + "_vs_" + "".join(
            word.capitalize() for word in y_name.split("+"))
        return plt, plt_name
    else:
        return plt
|
speckleUtils/speckle_utils.py
ADDED
@@ -0,0 +1,696 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#speckle utils
|
2 |
+
import json
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
import specklepy
|
6 |
+
from specklepy.api.client import SpeckleClient
|
7 |
+
from specklepy.api.credentials import get_default_account, get_local_accounts
|
8 |
+
from specklepy.transports.server import ServerTransport
|
9 |
+
from specklepy.api import operations
|
10 |
+
from specklepy.objects.geometry import Polyline, Point, Mesh
|
11 |
+
|
12 |
+
from specklepy.api.wrapper import StreamWrapper
|
13 |
+
try:
|
14 |
+
import openai
|
15 |
+
except:
|
16 |
+
pass
|
17 |
+
|
18 |
+
import requests
|
19 |
+
from datetime import datetime
|
20 |
+
import copy
|
21 |
+
|
22 |
+
|
23 |
+
# HELP FUNCTION ===============================================================
|
24 |
+
def helper():
    """
    Prints out the help message for this module.

    The message lists the main functions with their call signatures and a
    short "standard usage" walkthrough for retrieving and updating data.
    """
    rule = "______________________________________________________________________"
    # The updateStreamAnalysis signature shown here must match the real one:
    # `client` is the first positional argument, followed by `new_data`.
    update_signature = ("updateStreamAnalysis(client, new_data, stream_id, branch_name, "
                        "geometryGroupPath, match_by_id, openai_key, return_original)")
    lines = (
        "This module contains a set of utility functions for speckle streams.",
        rule,
        "It requires the specklepy package to be installed -> !pip install specklepy",
        "the following functions are available:",
        "getSpeckleStream(stream_id, branch_name, client)",
        "getSpeckleGlobals(stream_id, client)",
        "get_dataframe(objects_raw, return_original_df)",
        update_signature,
        "there are some more function available not documented fully yet, including updating a notion database",
        rule,
        "for detailed help call >>> help(speckle_utils.function_name) <<< ",
        rule,
        "standard usage:",
        rule,
        "retreiving data",
        "1. import speckle_utils & speckle related libaries from specklepy",
        "2. create a speckle client -> client = SpeckleClient(host='https://speckle.xyz/')",
        " client.authenticate_with_token(token='your_token_here')",
        "3. get a speckle stream -> stream = speckle_utils.getSpeckleStream(stream_id, branch_name, client)",
        "4. get the stream data -> data = stream['pth']['to']['data']",
        "5. transform data to dataframe -> df = speckle_utils.get_dataframe(data, return_original_df=False)",
        rule,
        "updating data",
        "1. call updateStreamAnalysis --> " + update_signature,
    )
    for line in lines:
        print(line)
|
52 |
+
|
53 |
+
|
54 |
+
#==============================================================================
|
55 |
+
|
56 |
+
def updateSpeckleStream(stream_id,
                        branch_name,
                        client,
                        data_object,
                        commit_message="Updated the data object",
                        ):
    """
    Send a data object to a speckle stream and commit it on a branch.

    Args:
        stream_id (str): The ID of the speckle stream.
        branch_name (str): The name of the branch the commit is created on.
        client: An authenticated speckle client (must provide `branch.get`
            and `commit.create`).
        data_object: The data object to send to the speckle stream.
        commit_message (str): The commit message. Defaults to "Updated the data object".

    Returns:
        The value returned by `client.commit.create` for the new commit.
    """
    # The branch is looked up first; the result itself is not used further.
    client.branch.get(stream_id, branch_name)

    # Upload the object through a server transport bound to this stream.
    server_transport = ServerTransport(client=client, stream_id=stream_id)
    sent_object_id = operations.send(data_object, [server_transport])

    # Record the uploaded object as a new commit on the requested branch.
    return client.commit.create(
        stream_id,
        object_id=sent_object_id,
        message=commit_message,
        branch_name=branch_name,
    )
|
88 |
+
def getSpeckleStream(stream_id,
                     branch_name,
                     client,
                     commit_id=""
                     ):
    """
    Retrieves data from a specific branch of a speckle stream.

    Args:
        stream_id (str): The ID of the speckle stream.
        branch_name (str): The name of the branch within the speckle stream.
        client: An authenticated speckle client.
        commit_id (str or int): Which commit to fetch.
            - "" (default): the latest commit of the branch.
            - non-empty str: the id of the commit to fetch.
            - int: index into the branch's commit list (0 = latest).

    Returns:
        The speckle object received for the chosen commit's referenced object.

    Raises:
        TypeError: If `commit_id` is neither a str nor an int.

    The branch details, the creation dates of the fetched commits, the chosen
    commit and its referenced object id are printed for debugging purposes.
    """
    select_by_index = isinstance(commit_id, int) and not isinstance(commit_id, bool)
    if not (select_by_index or isinstance(commit_id, str)):
        raise TypeError(
            f"commit_id must be a str (commit id) or an int (commit index), got {type(commit_id).__name__}"
        )

    # Three commits are fetched by default; fetch more when a higher index is
    # requested so that the index can actually be resolved.
    commits_limit = 3
    if select_by_index and commit_id >= commits_limit:
        commits_limit = commit_id + 1

    # set stream and branch
    try:
        branch = client.branch.get(stream_id, branch_name, commits_limit)
        print(branch)
    except Exception:
        # Fall back to requesting a single commit if the larger request fails.
        branch = client.branch.get(stream_id, branch_name, 1)
        print(branch)

    print("last three commits:")
    for fetched_commit in branch.commits.items:
        print(fetched_commit.createdAt)

    if commit_id == "":
        latest_commit = branch.commits.items[0]
        choosen_commit_id = latest_commit.id
        print("latest commit ", latest_commit.createdAt, " was choosen")
    elif isinstance(commit_id, str):  # string, commit uuid
        choosen_commit_id = commit_id
        print("provided commit ", choosen_commit_id, " was choosen")
    else:  # int, position in the branch's commit list
        choosen_commit_id = branch.commits.items[commit_id].id

    commit = client.commit.get(stream_id, choosen_commit_id)

    print(commit)
    print(commit.referencedObject)
    # get transport
    transport = ServerTransport(client=client, stream_id=stream_id)
    # speckle stream
    res = operations.receive(commit.referencedObject, transport)

    return res
|
147 |
+
|
148 |
+
def getSpeckleGlobals(stream_id, client):
    """
    Retrieves global analysis information from the "globals" branch of a speckle stream.

    Args:
        stream_id (str): The ID of the speckle stream.
        client: An authenticated speckle client.

    Returns:
        analysisInfo (dict or None): Parsed content of globs["analysisInfo"]["@{0;0;0;0}"][0].
            None if it could not be decoded or no globals were found.
        analysisGroups (list or None): Parsed entries of globs["analysisGroups"]["@{0}"].
            None if they could not be decoded or no globals were found.

    The stored strings are converted to JSON by replacing single quotes with
    double quotes and "None" with "null" before decoding. A JSON decoding error
    in one of the two attributes is reported and only that attribute becomes
    None. Any other error (missing branch, missing attribute, transport
    failure, ...) is reported as "No GlOBALS FOUND" and both results are None.
    """

    def loads_lenient(text):
        # The globals are stored in a Python-repr-like form; patch the most
        # common differences to JSON before decoding.
        return json.loads(text.replace("'", '"').replace("None", "null"))

    def report_decode_error(label, e, context=100):
        # Print the error plus the text around the failing position with a
        # caret placed under the offending character.
        start = max(0, e.pos - context)
        end = e.pos + context
        print(f"Error decoding {label}: {e}")
        print("Error position and surrounding text:")
        print(e.doc[start:end])
        print(' ' * (e.pos - start) + '^')

    analysisInfo = None
    analysisGroups = None
    try:
        # get the latest commit of the speckle stream globals
        branchGlob = client.branch.get(stream_id, "globals")
        latest_commit_Glob = branchGlob.commits.items[0]
        transport = ServerTransport(client=client, stream_id=stream_id)

        globs = operations.receive(latest_commit_Glob.referencedObject, transport)

        # access and parse globals
        try:
            analysisInfo = loads_lenient(globs["analysisInfo"]["@{0;0;0;0}"][0])
        except json.JSONDecodeError as e:
            report_decode_error("analysisInfo", e)
            analysisInfo = None

        try:
            analysisGroups = [loads_lenient(gr) for gr in globs["analysisGroups"]["@{0}"]]
        except json.JSONDecodeError as e:
            report_decode_error("analysisGroups", e)
            analysisGroups = None

        # extract analysis names and uuids from keys shaped "<name>++<uuid>"
        analysis_names = []
        analysis_uuid = []
        for key in (analysisInfo or {}):
            parts = key.split("++")
            analysis_names.append(parts[0])
            analysis_uuid.append(parts[1] if len(parts) > 1 else "")

        # print extracted results
        print("there are global dictionaries with additional information for each analysis")
        print("<analysisGroups> -> ", [list(curgrp.keys()) for curgrp in (analysisGroups or [])])
        print("<analysis_names> -> ", analysis_names)
        print("<analysis_uuid> -> ", analysis_uuid)
    except Exception as e:  # anything else means the globals are unusable
        analysisInfo = None
        analysisGroups = None
        print("No GlOBALS FOUND")
        print(f"Error: {e}")  # print error description

    return analysisInfo, analysisGroups
|
225 |
+
|
226 |
+
|
227 |
+
|
228 |
+
#function to extract non geometry data from speckle
|
229 |
+
def get_dataframe(objects_raw, return_original_df=False):
    """
    Creates a pandas DataFrame from a list of raw Speckle objects.

    Args:
        objects_raw (list): Objects whose `__dict__` holds the attributes to tabulate.
        return_original_df (bool, optional): If True, the function also returns the
            original DataFrame before any conversion to numeric. Defaults to False.

    Returns:
        pd.DataFrame or tuple: If return_original_df is False, returns a DataFrame in
            which every value that can be parsed as a number has been converted, while
            values that cannot be parsed keep their original value.
            If return_original_df is True, returns a tuple `(converted, original)`.

    Each object contributes one row built from its `__dict__`, excluding the
    '@Geometry' attribute.
    """
    # One record per object, without the geometry payload.
    records = [
        {key: value for key, value in obj_raw.__dict__.items() if key != '@Geometry'}
        for obj_raw in objects_raw
    ]

    df_copy = pd.DataFrame(records)
    df = df_copy.copy()

    # Convert to numbers where possible; values that fail to parse come back as
    # NaN and are then restored from the untouched original column. The result
    # is assigned back explicitly (instead of an in-place fillna on `df[col]`)
    # so the restore also takes effect under pandas copy-on-write.
    for col in df.columns:
        numeric = pd.to_numeric(df_copy[col], errors='coerce')
        df[col] = numeric.fillna(df_copy[col])

    if return_original_df:
        return df, df_copy
    else:
        return df
|
268 |
+
|
269 |
+
|
270 |
+
def updateStreamAnalysis(
        client,
        new_data,
        stream_id,
        branch_name,
        geometryGroupPath=None,
        match_by_id="",
        openai_key="",
        return_original=False
):
    """
    Updates Stream Analysis by modifying object attributes based on new data.

    Args:
        client: An authenticated speckle client.
        new_data (pandas.DataFrame): DataFrame containing new data; every column is
            written as an attribute onto the matched object.
        stream_id (str): Stream ID.
        branch_name (str): Branch name.
        geometryGroupPath (list, optional): Keys leading from the received commit object
            to the list of geometry objects. Defaults to ["@Speckle", "Geometry"].
        match_by_id (str, optional): Key of the column/attribute used for matching rows to
            objects. If empty, the DataFrame index is used as the object's position.
        openai_key (str, optional): Currently unused; the AI commit message is disabled.
        return_original (bool, optional): Determines whether to return original speckle
            stream objects. Defaults to False.

    Returns:
        list or None: original speckle stream objects as backup if return_original is
            set to True, otherwise None.

    Raises:
        KeyError: If a row of `new_data` cannot be matched to an object.

    The latest commit of the branch is received, a deep copy of it is updated
    with the values of `new_data`, sent back to the server and committed on the
    same branch.
    """
    if geometryGroupPath is None:
        geometryGroupPath = ["@Speckle", "Geometry"]

    def resolve_geometry_group(root):
        # Follow the key path down to the list of geometry objects.
        node = root
        for key in geometryGroupPath:
            node = node[key]
        return node

    branch = client.branch.get(stream_id, branch_name, 2)
    latest_commit = branch.commits.items[0]
    commit = client.commit.get(stream_id, latest_commit.id)

    # get objects
    transport = ServerTransport(client=client, stream_id=stream_id)
    res = operations.receive(commit.referencedObject, transport)

    # get geometry objects (they carry the attributes)
    objects_raw = resolve_geometry_group(res)
    res_new = copy.deepcopy(res)
    target_objects = resolve_geometry_group(res_new)

    # map ids -> position of the object in the geometry list
    use_match_column = match_by_id != ""
    if use_match_column:
        id_mapper = {obj[match_by_id]: i for i, obj in enumerate(objects_raw)}
    else:
        id_mapper = {str(i): i for i in range(len(objects_raw))}

    # iterate through rows (objects)
    for index, row in new_data.iterrows():
        # determine target object; positional keys are stored as strings, so
        # the DataFrame index has to be converted before the lookup
        local_id = row[match_by_id] if use_match_column else str(index)
        target_id = id_mapper[local_id]

        # iterate through columns (attributes)
        for col_name in new_data.columns:
            target_objects[target_id][col_name] = row[col_name]

    # The AI-generated commit summary is disabled; the message suffix stays empty.
    answer_summary = ""

    new_objects_raw_speckle_id = operations.send(base=res_new, transports=[transport])

    # Create a commit on the stream with the updated object
    client.commit.create(
        stream_id=stream_id,
        branch_name=branch_name,
        object_id=new_objects_raw_speckle_id,
        message="Updated item in colab -" + answer_summary,
    )
    print("Commit created!")
    if return_original:
        return objects_raw  # as back-up
|
377 |
+
|
378 |
+
def custom_describe(df):
    """
    Describe a DataFrame with a fixed set of statistics.

    Each column is converted to a numeric dtype when all of its values can be
    parsed as numbers; other columns are left unchanged.

    Args:
        df (pd.DataFrame): Data to summarize.

    Returns:
        pd.DataFrame: Rows 'count', 'unique', 'mean', 'min', 'max' (always all
        five, in this order) for every column. Statistics that do not apply to
        a column, or that `describe` did not produce at all, are NaN.
    """

    def to_numeric_if_possible(column):
        # Keep the column untouched when it cannot be parsed as numbers.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    # Convert columns to numeric if possible
    df = df.apply(to_numeric_if_possible)

    # Initial describe with 'include = all'
    desc = df.describe(include='all')

    # Desired statistics
    desired_stats = ['count', 'unique', 'mean', 'min', 'max']

    # reindex (rather than .loc) so that a statistic missing from `desc`
    # (e.g. 'unique' for purely numeric data) yields a NaN row instead of a
    # KeyError.
    result = desc.reindex(desired_stats)
    return result
|
391 |
+
|
392 |
+
def compareStats(df_before, df_after):
    """Compare the descriptive statistics of two DataFrames.

    Both frames are summarised with ``custom_describe``.  Every column of
    ``df_after`` is classified as

    * ``updated`` - the column also exists in ``df_before``; its before and
      after statistics are listed side by side, or
    * ``new`` - the column exists only in ``df_after``; its ``*_before``
      statistics are reported as ``'NA'``.

    Columns that exist only in ``df_before`` are not reported.  An
    ``updated`` column is written to the report only when at least one
    statistic actually changed.  Two missing (NaN) statistics count as
    equal, so a column is not reported merely because a statistic is
    undefined on both sides.  All ``updated`` columns are counted in the
    summary, whether or not they appear in the report.

    Args:
        df_before (pd.DataFrame): State of the data before the operation.
        df_after (pd.DataFrame): State of the data after the operation.

    Returns:
        tuple[str, str]: ``(csv_output, summary)``.  ``summary`` states the
        number of updated and new columns; ``csv_output`` is that summary
        followed by the per-column report in CSV format.
    """
    desc_before = custom_describe(df_before)
    desc_after = custom_describe(df_after)

    def _rounded(value):
        # Round numeric statistics to one decimal, leave everything else as is.
        return round(value, 1) if pd.api.types.is_number(value) else value

    def _differs(before_val, after_val):
        # NaN != NaN is True in Python; two missing statistics are no change.
        if pd.isna(before_val) and pd.isna(after_val):
            return False
        return before_val != after_val

    # Track number of updated and new columns
    updated_cols = 0
    new_cols = 0

    # Prepare DataFrame output
    output_data = []

    # Columns present only in df_before never produce output, so walking the
    # 'after' columns covers every reported case - in a stable order instead
    # of the arbitrary order of a set.
    for column in dict.fromkeys(desc_after.columns):
        row_data = {'column': column}

        if column in desc_before.columns:
            updated_cols += 1
            stat_diff = False
            row_data['interventionType'] = 'updated'
            for stat in desc_before.index:
                before_val = _rounded(desc_before.loc[stat, column])
                after_val = _rounded(desc_after.loc[stat, column])
                if _differs(before_val, after_val):
                    stat_diff = True
                row_data[stat+'_before'] = before_val
                row_data[stat+'_after'] = after_val
        else:
            new_cols += 1
            stat_diff = True
            row_data['interventionType'] = 'new'
            for stat in desc_after.index:
                row_data[stat+'_before'] = 'NA'
                row_data[stat+'_after'] = _rounded(desc_after.loc[stat, column])

        # Only report columns whose statistics actually differ.
        if stat_diff:
            output_data.append(row_data)

    output_df = pd.DataFrame(output_data)
    csv_output = output_df.to_csv(index=False)
    print (output_df)

    # Add summary to beginning of output
    summary = f"Summary:\n Number of updated columns: {updated_cols}\n Number of new columns: {new_cols}\n\n"
    csv_output = summary + csv_output

    return csv_output, summary
|
462 |
+
|
463 |
+
|
464 |
+
|
465 |
+
# Function to call ChatGPT API
|
466 |
+
def ask_chatgpt(prompt, model="gpt-3.5-turbo", max_tokens=300, n=1, stop=None, temperature=0.3):
    """Send *prompt* to the OpenAI chat completion endpoint and return the reply text.

    Args:
        prompt (str): Text sent as the user message.
        model (str): Name of the chat model to query.
        max_tokens (int): Upper bound for the length of the reply.
        n (int): Number of completions requested; only the first one is returned.
        stop: Optional stop sequence(s) forwarded to the API.
        temperature (float): Sampling temperature forwarded to the API.

    Returns:
        The ``content`` of the first returned choice.
    """
    # NOTE(review): ``openai.ChatCompletion`` looks like the pre-1.0 interface
    # of the openai package - confirm against the pinned version.
    import openai

    conversation = [
        {"role": "system", "content": "You are a helpfull assistant,."},
        {"role": "user", "content": prompt},
    ]
    request_options = dict(
        model=model,
        messages=conversation,
        max_tokens=max_tokens,
        n=n,
        stop=stop,
        temperature=temperature,
    )
    completion = openai.ChatCompletion.create(**request_options)
    first_choice = completion.choices[0]
    return first_choice.message['content']
|
480 |
+
|
481 |
+
|
482 |
+
|
483 |
+
|
484 |
+
def gptCommitMessage(objects_raw, new_data, openai_key):
    """Create a commit message describing how *new_data* differs from *objects_raw*.

    Commits coming through this channel are all about updating or adding a
    data table, so the message is derived from a before/after comparison of
    descriptive statistics (see ``compareStats``).  ChatGPT is asked to turn
    that report into prose; the part of its answer after the ``>>>`` marker
    is returned.  This is best effort: whenever the openai package is
    missing, the request fails, or the answer has no ``>>>`` marker, the
    plain statistics summary is returned instead.

    Args:
        objects_raw: Speckle objects holding the state before the update
            (converted with ``get_dataframe``).
        new_data (pd.DataFrame): The updated data table.
        openai_key (str): API key assigned to ``openai.api_key``.

    Returns:
        str: The generated conclusion, or the fallback statistics summary.
    """
    report, summary = compareStats(get_dataframe(objects_raw), new_data)

    # prompt
    prompt = f"""Given the following changes in my tabular data structure, generate a
    precise and informative commit message. The changes involve updating or adding
    attribute keys and values. The provided summary statistics detail the changes in
    the data from 'before' to 'after'.
    The CSV format below demonstrates the structure of the summary:

    Summary:
    Number of updated columns: 2
    Number of new columns: 1
    column,interventionType,count_before,count_after,unique_before,unique_after,mean_before,mean_after,min_before,min_after,max_before,max_after
    A,updated,800,800,2,3,,nan,nan,nan,nan,nan
    B,updated,800,800,3,3,,nan,nan,nan,nan,nan
    C,new,NA,800,NA,4,NA,nan,NA,nan,NA,nan

    For the commit message, your focus should be on changes in the data structure, not the interpretation of the content. Be precise, state the facts, and highlight significant differences or trends in the statistics, such as shifts in mean values or an increase in unique entries.

    Based on the above guidance, draft a commit message using the following actual summary statistics:

    {report}

    Your commit message should follow this structure:

    1. Brief description of the overall changes.
    2. Significant changes in summary statistics (count, unique, mean, min, max).
    3. Conclusion, summarizing the most important findings with the strucutre:
    # changed columns: , comment: ,
    # added Columns: , comment: ,
    # Chaged statistic: , coment: ,

    Mark the beginning of the conclusion with ">>>" and ensure to emphasize hard facts and significant findings.
    """

    answer = None
    try:
        # A missing package raises ImportError (not NameError).
        import openai
    except ImportError:
        print("No auto commit message: openai module not imported. Please import the module before setting the API key.")
    else:
        openai.api_key = openai_key
        try:
            answer = ask_chatgpt(prompt)
        except Exception as err:
            # Best effort: any API/network failure falls back to the summary.
            print(f"No auto commit message: ChatGPT request failed ({err}).")

    # Validate the answer before splitting it; fall back when the marker is missing.
    if answer is not None and ">>>" in answer:
        answer_summary = answer.split(">>>")[1]
    else:
        answer_summary = summary

    print(answer_summary)
    return answer_summary
|
545 |
+
|
546 |
+
def specklePolyline_to_BokehPatches(speckle_objs, pth_to_geo="curves", id_key="ids"):
    """Extract polyline coordinates from Speckle objects for Bokeh ``Patches``.

    For every object the geometry stored under ``pth_to_geo`` is converted to
    points with ``Polyline.as_points`` and split into a list of x and a list
    of y coordinates.

    Args:
        speckle_objs (list): A list of Speckle objects.
        pth_to_geo (str): Key under which each object stores its polyline
            geometry. Defaults to "curves".
        id_key (str): Key under which each object stores its identifier.
            Defaults to "ids". The values are always written to the
            "uuid" column of the result.

    Returns:
        pd.DataFrame: One row per object with the columns "uuid",
        "patches_x" and "patches_y".
    """
    patchesDict = {"uuid": [], "patches_x": [], "patches_y": []}

    for obj in speckle_objs:
        # Materialise once: the points are traversed twice below.
        points = list(Polyline.as_points(obj[pth_to_geo]))

        patchesDict["patches_x"].append([pt.x for pt in points])
        patchesDict["patches_y"].append([pt.y for pt in points])
        patchesDict["uuid"].append(obj[id_key])

    return pd.DataFrame(patchesDict)
|
575 |
+
|
576 |
+
|
577 |
+
|
578 |
+
def rebuildAnalysisInfoDict(analysisInfo):
    """Rebuild the analysisInfo dictionary with the "++" suffixes removed from its keys.

    Only the first element of *analysisInfo* is read.  Each key is cut at its
    first "++"; if several keys share the same prefix, the value of the last
    one wins.

    Args:
        analysisInfo (list): A list whose first element is the analysisInfo
            dictionary.

    Returns:
        dict: The analysisInfo dictionary with shortened keys, or an empty
        dict when *analysisInfo* is empty.
    """
    # An empty container has nothing to rebuild (previously an IndexError).
    if not analysisInfo:
        return {}

    info = analysisInfo[0]
    return {curKey.split("++")[0]: info[curKey] for curKey in info}
|
593 |
+
|
594 |
+
|
595 |
+
def _mesh_vertices_xy(vertices):
    """Split mesh vertices into parallel lists of x and y coordinates."""
    vertices = list(vertices or [])
    if vertices and not hasattr(vertices[0], "x"):
        # NOTE(review): specklepy documents ``Mesh.vertices`` as a flat
        # [x0, y0, z0, x1, y1, z1, ...] list of floats - confirm against the
        # pinned specklepy version.  Every third value starting at 0 / 1 is
        # an x / y coordinate.
        return vertices[0::3], vertices[1::3]
    # Point-like vertices (objects exposing .x / .y) keep working as before.
    return [pt.x for pt in vertices], [pt.y for pt in vertices]


def specklePolyline2Patches(speckle_objs, pth_to_geo="curves", id_key=None):
    """Convert the geometry of Speckle objects into patch coordinate lists.

    For every object the geometry stored under ``pth_to_geo`` is read.
    ``Mesh`` geometry contributes its vertices; any other geometry is
    converted to points with ``Polyline.as_points``.  The result is suitable
    for Bokeh or matplotlib patches.

    Args:
        speckle_objs (list): A list of Speckle objects.
        pth_to_geo (str, optional): Key under which each object stores its
            geometry. Defaults to "curves".
        id_key (str, optional): Key under which each object stores its
            identifier. Defaults to None, in which case no identifier column
            is produced.

    Returns:
        pd.DataFrame: One row per object with the columns "patches_x" and
        "patches_y" (lists of x and y coordinates) and, when ``id_key`` is
        given, a column named after ``id_key`` holding the identifiers.
    """
    patchesDict = {"patches_x": [], "patches_y": []}
    if id_key is not None:
        patchesDict[id_key] = []

    for obj in speckle_objs:
        obj_geo = obj[pth_to_geo]

        if isinstance(obj_geo, Mesh):
            # For meshes, we'll just use the vertices for now
            coorX, coorY = _mesh_vertices_xy(obj_geo.vertices)
        else:
            # Materialise once: the points are traversed twice below.
            points = list(Polyline.as_points(obj_geo))
            coorX = [pt.x for pt in points]
            coorY = [pt.y for pt in points]

        patchesDict["patches_x"].append(coorX)
        patchesDict["patches_y"].append(coorY)
        if id_key is not None:
            patchesDict[id_key].append(obj[id_key])

    return pd.DataFrame(patchesDict)
|
641 |
+
|
642 |
+
|
643 |
+
#================= NOTION INTEGRATION ============================
|
644 |
+
# Shared HTTP headers for the Notion API requests below.
# NOTE: get_page_id and add_or_update_page add an 'Authorization' entry to
# this module-level dict at call time, so it holds the most recently used
# token after the first call.
headers = {
    "Notion-Version": "2022-06-28",
    "Content-Type": "application/json"
}
|
648 |
+
|
649 |
+
def get_page_id(token, database_id, name):
    """Look up the id of the page called *name* in a Notion database.

    The database is queried page by page (the Notion API returns results in
    batches and signals further batches with ``has_more`` / ``next_cursor``)
    until a page whose ``name`` title property equals *name* is found.

    Args:
        token (str): Notion integration token, sent as a Bearer token.
        database_id (str): Id of the Notion database to search.
        name (str): Title to look for.

    Returns:
        The id of the first matching page, or None if no page matches.

    Raises:
        KeyError: If the response carries no ``results`` (e.g. an error
            response from the API).
    """
    headers['Authorization'] = "Bearer " + token
    url = f"https://api.notion.com/v1/databases/{database_id}/query"

    cursor = None
    while True:
        # The first request is sent without a body; follow-up requests carry
        # the cursor returned by the previous batch.
        body = json.dumps({'start_cursor': cursor}) if cursor else None
        response = requests.post(url, headers=headers, data=body)

        # Load the response data
        data = json.loads(response.text)

        # Check each page in the results
        for page in data['results']:
            title = page['properties']['name']['title']
            # Untitled pages have an empty title list; skip them.
            if title and title[0]['text']['content'] == name:
                return page['id']

        cursor = data.get('next_cursor')
        if not data.get('has_more') or not cursor:
            # If no match was found, return None
            return None
|
665 |
+
|
666 |
+
def add_or_update_page(token, database_id, name, type, time_updated, comment, speckle_link):
    """Create or update the Notion page called *name* in a database.

    The page is looked up with ``get_page_id``.  An existing page is updated
    with a PATCH request; otherwise a new page is created with a POST
    request.  The raw response text is printed.

    Args:
        token (str): Notion integration token, sent as a Bearer token.
        database_id (str): Id of the Notion database holding the page.
        name (str): Page title; also used to find an existing page.
        type (str): Text for the ``type`` property. (The parameter name
            shadows the builtin; it is kept for backward compatibility.)
        time_updated: Object with a ``strftime`` method (e.g. a datetime);
            stored as a 'YYYY-MM-DD' date.
        comment (str): Text for the ``comment`` property.
        speckle_link (str): Text for the ``speckle_link`` property.
    """
    # Format time_updated as a string 'YYYY-MM-DD'
    date_string = time_updated.strftime('%Y-%m-%d')

    def _rich_text(content):
        # Notion rich_text property holding a single text run.
        return {'rich_text': [{'text': {'content': content}}]}

    properties = {
        'name': {'title': [{'text': {'content': name}}]},
        'type': _rich_text(type),
        'time_updated': {'date': {'start': date_string}},
        'comment': _rich_text(comment),
        'speckle_link': _rich_text(speckle_link),
    }

    # Check if a page with this name already exists
    page_id = get_page_id(token, database_id, name)

    headers['Authorization'] = "Bearer " + token
    if page_id:
        # NOTE(review): the "Update page" endpoint documents no 'parent'
        # body parameter, so only the properties are sent on update -
        # confirm against the Notion API reference.
        payload = {'properties': properties}
        response = requests.patch(f"https://api.notion.com/v1/pages/{page_id}", headers=headers, data=json.dumps(payload))
    else:
        # A new page must name the database it is created in.
        payload = {'parent': {'database_id': database_id}, 'properties': properties}
        response = requests.post("https://api.notion.com/v1/pages", headers=headers, data=json.dumps(payload))

    print(response.text)
|
694 |
+
|
695 |
+
# Use the function
|
696 |
+
#add_or_update_page('your_token', 'your_database_id', 'New Title', 'New Type', datetime.now(), 'This is a comment', 'https://your-link.com')
|
tripGenerationFunc.py
CHANGED
@@ -18,13 +18,16 @@ from functools import wraps
|
|
18 |
|
19 |
|
20 |
|
21 |
-
sys.path.append("
|
22 |
|
23 |
|
24 |
-
from .
|
25 |
-
|
26 |
|
27 |
|
|
|
|
|
|
|
|
|
28 |
|
29 |
# !!! lots of hard coded values in computeTrips !!!
|
30 |
|
|
|
18 |
|
19 |
|
20 |
|
21 |
+
sys.path.append("speckleUtils")
|
22 |
|
23 |
|
24 |
+
from .speckleUtils import speckle_utils
|
|
|
25 |
|
26 |
|
27 |
+
#https://serjd-syncspeckle2notion.hf.space/webhooks/update_streams
|
28 |
+
#https://serjd-RECODE_HF_tripGeneration.hf.space/webhooks/update_streams
|
29 |
+
#serJD/RECODE_HF_tripGeneration
|
30 |
+
# https://huggingface.co/spaces/serJD/RECODE_HF_tripGeneration
|
31 |
|
32 |
# !!! lots of hard coded values in computeTrips !!!
|
33 |
|