Spaces:
Running
Running
victormiller
committed on
Commit
•
e723f50
1
Parent(s):
7cc1892
Update curated.py
Browse files- curated.py +32 -0
curated.py
CHANGED
@@ -919,6 +919,30 @@ def get_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo")
|
|
919 |
)
|
920 |
|
921 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
922 |
def update(target: str, request):
|
923 |
params = request.query_params
|
924 |
if data_source := params.get(f"data_source_{target}"):
|
@@ -1043,6 +1067,13 @@ def curated(request):
|
|
1043 |
|
1044 |
|
1045 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1046 |
preprocessing_steps = pd.DataFrame(
|
1047 |
{
|
1048 |
"Step": [
|
@@ -1127,6 +1158,7 @@ def curated(request):
|
|
1127 |
plotly2fasthtml(diff2_stacked_bar),
|
1128 |
P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
|
1129 |
filtering_process,
|
|
|
1130 |
data_preparation_div,
|
1131 |
#H2("Local Deduplication"), are these numbers even right?
|
1132 |
#local_dedup_text,
|
|
|
919 |
)
|
920 |
|
921 |
|
922 |
+
|
923 |
+
def get_freelaw_data(data_source: str = "Freelaw", doc_id: int = 3, target: str = "foo"):
    """Build the raw-vs-extracted sample view for one Freelaw document.

    Args:
        data_source: Name of the dataset to show. Only ``"Freelaw"`` loads the
            sample files; any other value falls back to empty placeholders.
        doc_id: Index of the sample document. It is coerced with ``int()`` and
            clamped to the range 0-9.
        target: Identifier forwarded unchanged to ``view_data``.

    Returns:
        Whatever ``view_data`` returns for the selected raw/extracted pair.
    """
    # Clamp so an out-of-range request still maps onto one of the ten samples.
    doc_id = max(0, min(int(doc_id), 9))

    if data_source == "Freelaw":
        # Context managers close the sample files deterministically instead of
        # leaving the handles open until garbage collection.
        with open("data/curated_samples/freelaw_raw.json", encoding="utf-8") as raw_file:
            raw_sample_doc = json.load(raw_file)
        with open("data/curated_samples/freelaw_extract.json", encoding="utf-8") as extract_file:
            extracted_sample_doc = json.load(extract_file)
    else:
        # Unknown source: both names share one list of ten empty records.
        raw_sample_doc = extracted_sample_doc = [{} for _ in range(10)]

    raw_json = raw_sample_doc[doc_id]
    extracted_json = extracted_sample_doc[doc_id]
    return view_data(
        raw_json,
        extracted_json,
        doc_id=doc_id,
        data_source=data_source,
        data_sources=data_sources,  # module-level name defined outside this function
        target=target,
    )
|
944 |
+
|
945 |
+
|
946 |
def update(target: str, request):
|
947 |
params = request.query_params
|
948 |
if data_source := params.get(f"data_source_{target}"):
|
|
|
1067 |
|
1068 |
|
1069 |
|
1070 |
+
freelaw_examples = Div(
|
1071 |
+
Div(
|
1072 |
+
get_freelaw_data(target=gen_random_id()),
|
1073 |
+
style="border: 1px solid #ccc; padding: 20px;",
|
1074 |
+
),
|
1075 |
+
)
|
1076 |
+
|
1077 |
preprocessing_steps = pd.DataFrame(
|
1078 |
{
|
1079 |
"Step": [
|
|
|
1158 |
plotly2fasthtml(diff2_stacked_bar),
|
1159 |
P("The figure above provides a global view of the document filtering results. ~8% of documents were removed during these three steps."),
|
1160 |
filtering_process,
|
1161 |
+
freelaw_examples,
|
1162 |
data_preparation_div,
|
1163 |
#H2("Local Deduplication"), are these numbers even right?
|
1164 |
#local_dedup_text,
|