mylibrar committed on
Commit
3dd0859
·
1 Parent(s): cb27b88

Add comments to each topic graph

Browse files
Files changed (2) hide show
  1. data/topic_charts.json +40 -20
  2. results.py +3 -3
data/topic_charts.json CHANGED
@@ -149,7 +149,8 @@
149
  ],
150
  "pctdistance": 1.2,
151
  "labeldistance": 1.5
152
- }
 
153
  }
154
  ],
155
  [
@@ -304,7 +305,8 @@
304
  "subplots_adjust": {
305
  "left": 0.37,
306
  "right": 0.98
307
- }
 
308
  }
309
  ],
310
  [
@@ -459,7 +461,8 @@
459
  "subplots_adjust": {
460
  "left": 0.37,
461
  "right": 0.98
462
- }
 
463
  }
464
  ],
465
  [
@@ -614,7 +617,8 @@
614
  "subplots_adjust": {
615
  "left": 0.37,
616
  "right": 0.98
617
- }
 
618
  }
619
  ],
620
  [
@@ -769,7 +773,8 @@
769
  "subplots_adjust": {
770
  "left": 0.37,
771
  "right": 0.98
772
- }
 
773
  }
774
  ],
775
  [
@@ -924,7 +929,8 @@
924
  "subplots_adjust": {
925
  "left": 0.37,
926
  "right": 0.98
927
- }
 
928
  }
929
  ],
930
  [
@@ -1079,7 +1085,8 @@
1079
  "subplots_adjust": {
1080
  "left": 0.37,
1081
  "right": 0.98
1082
- }
 
1083
  }
1084
  ],
1085
  [
@@ -1234,7 +1241,8 @@
1234
  "subplots_adjust": {
1235
  "left": 0.37,
1236
  "right": 0.98
1237
- }
 
1238
  }
1239
  ],
1240
  [
@@ -1389,7 +1397,8 @@
1389
  "subplots_adjust": {
1390
  "left": 0.37,
1391
  "right": 0.98
1392
- }
 
1393
  }
1394
  ],
1395
  [
@@ -1544,7 +1553,8 @@
1544
  "subplots_adjust": {
1545
  "left": 0.37,
1546
  "right": 0.98
1547
- }
 
1548
  }
1549
  ],
1550
  [
@@ -1699,7 +1709,8 @@
1699
  "subplots_adjust": {
1700
  "left": 0.37,
1701
  "right": 0.98
1702
- }
 
1703
  }
1704
  ],
1705
  [
@@ -1854,7 +1865,8 @@
1854
  "subplots_adjust": {
1855
  "left": 0.37,
1856
  "right": 0.98
1857
- }
 
1858
  }
1859
  ],
1860
  [
@@ -2009,7 +2021,8 @@
2009
  "subplots_adjust": {
2010
  "left": 0.37,
2011
  "right": 0.98
2012
- }
 
2013
  }
2014
  ],
2015
  [
@@ -2164,7 +2177,8 @@
2164
  "subplots_adjust": {
2165
  "left": 0.37,
2166
  "right": 0.98
2167
- }
 
2168
  }
2169
  ],
2170
  [
@@ -2319,7 +2333,8 @@
2319
  "subplots_adjust": {
2320
  "left": 0.37,
2321
  "right": 0.98
2322
- }
 
2323
  }
2324
  ],
2325
  [
@@ -2474,7 +2489,8 @@
2474
  "subplots_adjust": {
2475
  "left": 0.37,
2476
  "right": 0.98
2477
- }
 
2478
  }
2479
  ],
2480
  [
@@ -2629,7 +2645,8 @@
2629
  "subplots_adjust": {
2630
  "left": 0.37,
2631
  "right": 0.98
2632
- }
 
2633
  }
2634
  ],
2635
  [
@@ -2784,7 +2801,8 @@
2784
  "subplots_adjust": {
2785
  "left": 0.37,
2786
  "right": 0.98
2787
- }
 
2788
  }
2789
  ],
2790
  [
@@ -2939,7 +2957,8 @@
2939
  "subplots_adjust": {
2940
  "left": 0.37,
2941
  "right": 0.98
2942
- }
 
2943
  }
2944
  ],
2945
  [
@@ -3094,7 +3113,8 @@
3094
  "subplots_adjust": {
3095
  "left": 0.37,
3096
  "right": 0.98
3097
- }
 
3098
  }
3099
  ],
3100
  [
 
149
  ],
150
  "pctdistance": 1.2,
151
  "labeldistance": 1.5
152
+ },
153
+ "comment": "As shown in the graph above, over 20% of the documents are related to Business & Economics & Finance, which makes it the largest topic group in the dataset. In contrast, the Culture & Cultural geography group contains the smallest number of documents among all topics."
154
  }
155
  ],
156
  [
 
305
  "subplots_adjust": {
306
  "left": 0.37,
307
  "right": 0.98
308
+ },
309
+ "comment": "On average, documents related to Shopping & Commodity have a larger fraction of words corrected in lines."
310
  }
311
  ],
312
  [
 
461
  "subplots_adjust": {
462
  "left": 0.37,
463
  "right": 0.98
464
+ },
465
+ "comment": "Compared with other topics, documents related to Personal Development & Human Resources & Career on average contain more lines ending with an ellipsis."
466
  }
467
  ],
468
  [
 
617
  "subplots_adjust": {
618
  "left": 0.37,
619
  "right": 0.98
620
+ },
621
+ "comment": "Shopping & Commodity related documents have a higher percentage of lines starting with a bullet point."
622
  }
623
  ],
624
  [
 
773
  "subplots_adjust": {
774
  "left": 0.37,
775
  "right": 0.98
776
+ },
777
+ "comment": "Personal Development & Human Resources & Career on average has more lines with toxic words."
778
  }
779
  ],
780
  [
 
929
  "subplots_adjust": {
930
  "left": 0.37,
931
  "right": 0.98
932
+ },
933
+ "comment": "Daily Life & Home & Lifestyle on average has more toxic words."
934
  }
935
  ],
936
  [
 
1085
  "subplots_adjust": {
1086
  "left": 0.37,
1087
  "right": 0.98
1088
+ },
1089
+ "comment": "Documents in the topic of Personal Development & Human Resources & Career on average contain more words than other topics."
1090
  }
1091
  ],
1092
  [
 
1241
  "subplots_adjust": {
1242
  "left": 0.37,
1243
  "right": 0.98
1244
+ },
1245
+ "comment": "There is no significant variance in the average word length for different topic groups. However, Education related data contain longer words than others in general."
1246
  }
1247
  ],
1248
  [
 
1397
  "subplots_adjust": {
1398
  "left": 0.37,
1399
  "right": 0.98
1400
+ },
1401
+ "comment": "Documents in the topic of Personal Development & Human Resources & Career usually contain more sentences."
1402
  }
1403
  ],
1404
  [
 
1553
  "subplots_adjust": {
1554
  "left": 0.37,
1555
  "right": 0.98
1556
+ },
1557
+ "comment": "Documents related to Daily Life & Home & Lifestyle usually have a higher percentage of symbols."
1558
  }
1559
  ],
1560
  [
 
1709
  "subplots_adjust": {
1710
  "left": 0.37,
1711
  "right": 0.98
1712
+ },
1713
+ "comment": "The fraction of words with alpha character seems to be relatively consistent across different topics."
1714
  }
1715
  ],
1716
  [
 
1865
  "subplots_adjust": {
1866
  "left": 0.37,
1867
  "right": 0.98
1868
+ },
1869
+ "comment": "Culture & Cultural geography contains more stop words on average."
1870
  }
1871
  ],
1872
  [
 
2021
  "subplots_adjust": {
2022
  "left": 0.37,
2023
  "right": 0.98
2024
+ },
2025
+ "comment": "Natural Science & Formal Science & Technology has a significantly higher rate in percentage of documents that contain curly bracket. It might be related to the coding data."
2026
  }
2027
  ],
2028
  [
 
2177
  "subplots_adjust": {
2178
  "left": 0.37,
2179
  "right": 0.98
2180
+ },
2181
+ "comment": "Sports related documents have a higher duplication count."
2182
  }
2183
  ],
2184
  [
 
2333
  "subplots_adjust": {
2334
  "left": 0.37,
2335
  "right": 0.98
2336
+ },
2337
+ "comment": "On average, Culture & Cultural geography related documents are duplicated across a higher number of common crawl dumps. Duplication of Shopping & Commodity appears in fewer dumps than others."
2338
  }
2339
  ],
2340
  [
 
2489
  "subplots_adjust": {
2490
  "left": 0.37,
2491
  "right": 0.98
2492
+ },
2493
+ "comment": "On average, Culture & Cultural geography related documents are duplicated across more years than other topics."
2494
  }
2495
  ],
2496
  [
 
2645
  "subplots_adjust": {
2646
  "left": 0.37,
2647
  "right": 0.98
2648
+ },
2649
+ "comment": "On average, Culture & Cultural geography related documents are duplicated across a wider span of years."
2650
  }
2651
  ],
2652
  [
 
2801
  "subplots_adjust": {
2802
  "left": 0.37,
2803
  "right": 0.98
2804
+ },
2805
+ "comment": "Average language scores of different topic groups are mostly consistent. No significant differences are observed."
2806
  }
2807
  ],
2808
  [
 
2957
  "subplots_adjust": {
2958
  "left": 0.37,
2959
  "right": 0.98
2960
+ },
2961
+ "comment": "On average, Shopping & Commodity has a larger fraction of duplicate lines than others."
2962
  }
2963
  ],
2964
  [
 
3113
  "subplots_adjust": {
3114
  "left": 0.37,
3115
  "right": 0.98
3116
+ },
3117
+ "comment": "Shopping & Commodity usually has a larger fraction of characters in duplicate lines than others."
3118
  }
3119
  ],
3120
  [
results.py CHANGED
@@ -990,7 +990,7 @@ for title, data in topic_charts:
990
  for rgb in data["kwargs"]["color"]
991
  ]
992
  )))
993
- else:
994
  topic_graphs.append(go.Figure(go.Pie(
995
  values=data["kwargs"]['x'],
996
  labels=data["kwargs"]["labels"],
@@ -1014,8 +1014,8 @@ cluster_div = Div(
1014
  )),
1015
  H3("Results Analysis"),
1016
  *(
1017
- Section(H4(title), plotly2fasthtml(topic_graphs[i]))
1018
- for i, (title, _) in enumerate(topic_charts)
1019
  )
1020
  )
1021
  )
 
990
  for rgb in data["kwargs"]["color"]
991
  ]
992
  )))
993
+ elif data["type"] == "pie":
994
  topic_graphs.append(go.Figure(go.Pie(
995
  values=data["kwargs"]['x'],
996
  labels=data["kwargs"]["labels"],
 
1014
  )),
1015
  H3("Results Analysis"),
1016
  *(
1017
+ Section(H4(title), plotly2fasthtml(topic_graphs[i]), P(data.get("comment", '')))
1018
+ for i, (title, data) in enumerate(topic_charts)
1019
  )
1020
  )
1021
  )