lambdaofgod commited on
Commit
7606e16
1 Parent(s): f15e1c2
pages/1_Retrieval_App.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List
3
+
4
+ import datasets
5
+ import pandas as pd
6
+ import sentence_transformers
7
+ import streamlit as st
8
+ from findkit import feature_extractors, indexes, retrieval_pipeline
9
+ from toolz import partial
10
+
11
+
12
def truncate_description(description, length=50):
    """Return *description* cut down to at most *length* whitespace-separated words."""
    words = description.split()
    return " ".join(words[:length])
14
+
15
+
16
def get_repos_with_descriptions(repos_df, repos):
    """Select the rows of *repos_df* whose index labels appear in *repos*.

    :param repos_df: DataFrame indexed by repository name
    :param repos: iterable of repository names (index labels) to select
    :return: the matching sub-DataFrame, in the order given by *repos*
    """
    return repos_df.loc[repos]
18
+
19
+
20
def search_f(
    retrieval_pipe: retrieval_pipeline.RetrievalPipeline,
    query: str,
    k: int,
    description_length: int,
    doc_col: List[str],
):
    """Return the top-*k* matches for *query* as a display-ready DataFrame.

    Adds a clickable GitHub ``link`` column derived from ``repo`` and
    truncates every column listed in *doc_col* to *description_length* words.
    """
    hits = retrieval_pipe.find_similar(query, k)
    hits["link"] = "https://github.com/" + hits["repo"]

    def shorten(text):
        return truncate_description(text, description_length)

    for column in doc_col:
        hits[column] = hits[column].apply(shorten)

    display_cols = ["repo", "tasks", "link", "distance"] + doc_col
    return hits.reset_index(drop=True)[display_cols]
37
+
38
+
39
def show_retrieval_results(
    retrieval_pipe: retrieval_pipeline.RetrievalPipeline,
    query: str,
    k: int,
    all_queries: List[str],
    description_length: int,
    repos_by_query: Dict[str, pd.DataFrame],
    doc_col: List[str],
):
    """Render retrieval results for *query* in the Streamlit page.

    If *query* belongs to the gold-standard query set, the known relevant
    repositories are also shown in an expander for comparison.

    :param retrieval_pipe: pipeline used to fetch nearest documents
    :param query: free-text search query
    :param k: number of results to fetch
    :param all_queries: gold-standard queries with known relevant repos
    :param description_length: word budget for truncated document columns
    :param repos_by_query: ``.get_group`` is called on this, so despite the
        annotation it is a pandas GroupBy keyed by task name — TODO tighten
    :param doc_col: document columns to display; was annotated ``str``, but the
        caller passes a list and ``search_f`` iterates it, so it is List[str]
    """
    print("started retrieval")
    if query in all_queries:
        with st.expander(
            "query is in gold standard set queries. Toggle viewing gold standard results?"
        ):
            st.write("gold standard results")
            task_repos = repos_by_query.get_group(query)
            st.table(get_repos_with_descriptions(retrieval_pipe.X_df, task_repos))
    with st.spinner(text="fetching results"):
        # Rendered as raw HTML (escape=False + unsafe_allow_html) so the
        # generated GitHub links stay clickable in the table.
        st.write(
            search_f(retrieval_pipe, query, k, description_length, doc_col).to_html(
                escape=False, index=False
            ),
            unsafe_allow_html=True,
        )
    print("finished retrieval")
64
+
65
+
66
def setup_pipeline(
    extractor: feature_extractors.SentenceEncoderFeatureExtractor,
    documents_df: pd.DataFrame,
    text_col: str,
):
    """Build and return a retrieval pipeline over ``documents_df[text_col]``.

    Fix: the original discarded the factory's result, so the function always
    returned ``None``; the built pipeline is now returned to the caller.

    NOTE(review): *extractor* is accepted but never used — presumably it was
    meant to be handed to the factory; confirm against findkit's API.
    """
    return retrieval_pipeline.RetrievalPipelineFactory.build(
        documents_df[text_col], metadata=documents_df
    )
74
+
75
+
76
@st.cache
def setup_retrieval_pipeline(
    query_encoder_path, document_encoder_path, documents, metadata
):
    """Load the query/document sentence encoders and build the search pipeline.

    Cached by Streamlit so the models and the NMSLIB index are not rebuilt on
    every rerun with the same arguments.

    :param query_encoder_path: model id/path for the query encoder
    :param document_encoder_path: model id/path for the document encoder
    :param documents: text corpus to index
    :param metadata: per-document metadata attached to the pipeline
    :return: the built retrieval pipeline
    """

    def _load_encoder(model_path):
        # device="cpu" keeps the app deployable on machines without a GPU.
        model = sentence_transformers.SentenceTransformer(model_path, device="cpu")
        return feature_extractors.SentenceEncoderFeatureExtractor(model)

    factory = retrieval_pipeline.RetrievalPipelineFactory(
        feature_extractor=_load_encoder(document_encoder_path),
        query_feature_extractor=_load_encoder(query_encoder_path),
        index_factory=partial(indexes.NMSLIBIndex.build, distance="cosinesimil"),
    )
    return factory.build(documents, metadata=metadata)
92
+
93
+
94
def app(retrieval_pipeline, retrieval_df, doc_col):
    """Render the retrieval page: sidebar controls plus the results view.

    :param retrieval_pipeline: built retrieval pipeline
    :param retrieval_df: repositories DataFrame backing the pipeline
    :param doc_col: name of the document text column offered as an extra column
    """
    n_results = st.sidebar.number_input("number of results", value=10)
    description_length = st.sidebar.number_input(
        "number of used description words", value=10
    )

    # One row per distinct task with its document count; doubles as the
    # gold-standard query list and the sidebar preview table.
    tasks_deduped = retrieval_df["tasks"].explode().value_counts().reset_index()
    tasks_deduped.columns = ["task", "documents per task"]
    with st.sidebar.expander("View test set queries"):
        st.table(tasks_deduped.explode("task"))

    additional_shown_cols = st.sidebar.multiselect(
        label="additional cols", options=[doc_col], default=doc_col
    )

    repos_by_query = retrieval_df.explode("tasks").groupby("tasks")
    query = st.text_input("input query", value="metric learning")
    show_retrieval_results(
        retrieval_pipeline,
        query,
        n_results,
        tasks_deduped["task"].to_list(),
        description_length,
        repos_by_query,
        additional_shown_cols,
    )
123
+
124
+
125
def app_main(
    query_encoder_path,
    document_encoder_path,
    data_path,
):
    """Entry point: load the dataset, build the pipeline and launch the app.

    :param query_encoder_path: Hugging Face model id/path for the query encoder
    :param document_encoder_path: Hugging Face model id/path for the document encoder
    :param data_path: Hugging Face datasets id of the repositories dataset
    """
    print("loading data")

    retrieval_df = datasets.load_dataset(data_path)["train"].to_pandas()
    print("setting up retrieval_pipe")
    doc_col = "dependencies"
    # Renamed local from `retrieval_pipeline`, which shadowed the findkit
    # module imported at the top of the file.
    pipeline = setup_retrieval_pipeline(
        query_encoder_path, document_encoder_path, retrieval_df[doc_col], retrieval_df
    )
    app(pipeline, retrieval_df, doc_col)
139
+
140
+
141
# Launch the page with the NBOW encoders and the PapersWithCode repositories
# dataset hosted on the Hugging Face Hub.
app_main(
    query_encoder_path="lambdaofgod/query_nbow_1_2000",
    document_encoder_path="lambdaofgod/document_nbow_1_2000",
    data_path="lambdaofgod/pwc_repositories_with_dependencies",
)
pages/2_Statistics.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import pandas as pd
import streamlit as st

# Precomputed evaluation results: queries with the best/worst retrieval hits.
best_results_df = pd.read_csv("output/best_tasks_with_hits.csv")


worst_results_df = pd.read_csv("output/worst_tasks_with_hits.csv")

show_worst_best_statistics = st.sidebar.checkbox(
    label="show worst/best statistics grouped by area"
)

show_area_aggregated_results = st.sidebar.checkbox(
    label="show results aggregated by area"
)
if show_worst_best_statistics:
    st.markdown("""
## Worst/best queries
The following are top 10 worst/best queries per area by number of hits.
There are at least 10 documents per query in the test set, so number of hits/10 is the accuracy.
""")
    # Explicit key: both branches render a selectbox labelled "sort by", and
    # Streamlit raises DuplicateWidgetID when both checkboxes are enabled
    # unless the widgets get distinct keys.
    sort_key = st.selectbox(
        "sort by", list(best_results_df.columns), key="sort_by_raw"
    )
    st.markdown("## Queries with best results")
    st.table(best_results_df.sort_values(sort_key, ascending=False))
    st.markdown("## Queries with worst results")
    st.table(worst_results_df.sort_values(sort_key, ascending=False))

if show_area_aggregated_results:
    st.markdown("## Area aggregated results")
    # NOTE(review): agg("mean") over non-numeric columns raises on pandas>=2.0;
    # confirm the pinned pandas version or pass numeric_only explicitly.
    best_results_agg = best_results_df.groupby("area").agg("mean").reset_index()
    worst_results_agg = worst_results_df.groupby("area").agg("mean").reset_index()
    sort_key = st.selectbox(
        "sort by", list(best_results_agg.columns), key="sort_by_aggregated"
    )
    st.markdown("Best results")
    st.table(best_results_agg.sort_values(sort_key, ascending=False))
    st.markdown("Worst results")
    st.table(worst_results_agg.sort_values(sort_key, ascending=False))
39
+
project_retrieval_app.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import streamlit as st

# Landing page of the multipage Streamlit app; the retrieval and statistics
# pages live under pages/.
st.set_page_config(page_title="Start")
st.markdown("""
# Searching Python projects with neural networks

## Authors

- Jakub Bartczuk
- Paweł Rychlikowski (promotor)

## Motivation
The following application illustrates neural network based models for searching github.

With over 500 starred repositories searching through them became cumbersome. I did a [small project for retrieval on starred repositories](https://github.com/lambdaofgod/examples-counterexamples/blob/master/notebooks/text_mining/Github_Starred_Repositories.ipynb) which looked promising, but it is hard to gauge how useful such solution would be in practice.

In the thesis I use [PapersWithCode](https://paperswithcode.com/) data for information retrieval.

PapersWithCode contains links between papers and repositories that implement them. Most repositories are tagged with at least one task like "unsupervised segmentation" or "semantic parsing".

Tasks are research topics like "object detection" or "multivariate time series imputation".

## Features
- [x] Searching using Neural Bag of Words features
- [ ] Searching using selectable model
- [ ] add Word2Vec on READMEs

""")
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ -e git+https://github.com/lambdaofgod/findkit#egg=findkit
2
+ sentence-transformers==2.2.2
3
+ toolz