File size: 5,485 Bytes
10be4e3
 
 
08d9321
30e2235
 
10be4e3
 
08d9321
 
30e2235
 
 
 
 
 
10be4e3
 
 
 
 
 
 
 
 
1855b51
10be4e3
 
 
1a2fbfc
 
 
 
11be492
1a2fbfc
10be4e3
 
d6cb72b
30e2235
10be4e3
 
 
1a2fbfc
d6cb72b
10be4e3
d6cb72b
10be4e3
 
1a2fbfc
d6cb72b
10be4e3
e98dc14
10be4e3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6cb72b
 
10be4e3
 
d6cb72b
 
 
 
 
 
 
 
 
 
10be4e3
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import html
import re
from itertools import chain

import gradio as gr
from datasets import load_from_disk
from nltk.util import everygrams
from pyserini.analysis import JWhiteSpaceAnalyzer
from pyserini.search.lucene import LuceneSearcher

# Open the Lucene index found at the relative path "index".
searcher = LuceneSearcher("index")
# Queries are pre-tokenized by tokenize_sentence() into space-separated
# character n-grams, so presumably the analyzer only needs to split on
# whitespace -- TODO confirm the index was built with the same tokenization.
searcher.set_analyzer(JWhiteSpaceAnalyzer())

def tokenize_word(word, min_len=2, max_len=4):
    """Return the character n-grams of *word* as a list of strings.

    Args:
        word: The string to break into n-grams.
        min_len: Smallest n-gram length to produce.
        max_len: Largest n-gram length to produce.

    Returns:
        One string per n-gram yielded by ``everygrams``, in the order it
        yields them.
    """
    # everygrams yields tuples of characters; glue each one back into a string.
    char_tuples = everygrams(word, min_len=min_len, max_len=max_len)
    return ["".join(chars) for chars in char_tuples]

def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Turn *sentence* into a space-separated string of character n-grams.

    The sentence is split on whitespace and every word is expanded with
    ``tokenize_word``; the n-grams of all words are concatenated in word
    order.

    Args:
        sentence: Text to tokenize.
        min_len: Smallest n-gram length, forwarded to ``tokenize_word``.
        max_len: Largest n-gram length, forwarded to ``tokenize_word``.

    Returns:
        All n-grams joined by single spaces; an empty string when the
        sentence contains no words.
    """
    tokens = []
    for word in sentence.split():
        tokens.extend(tokenize_word(word, min_len=min_len, max_len=max_len))
    return " ".join(tokens)

# Dataset of searchable documents, loaded from the relative path "data".
# page_0 / page_i pick rows from it by the integer docids of the search hits.
ds = load_from_disk("data")
# Number of result pages. Fixed when the UI is built: one pagination button
# exists per page, and Gradio cannot create components dynamically.
NUM_PAGES = 10
# Together with NUM_PAGES this sets how many hits page_0 requests per search
# (NUM_PAGES * RESULTS_PER_PAGE).
RESULTS_PER_PAGE = 5 

# Dataset column whose text is displayed (and highlighted) for each hit.
TEXT_FIELD = "content"
# Dataset column shown as the "docid" label above each hit.
METADATA_FIELD = "docid"

# Number of characters of a hit shown in the always-visible summary line.
_SUMMARY_CHARS = 250


def result_html(result, meta):
    """Wrap one already-rendered hit in a collapsible HTML card.

    Args:
        result: HTML markup of the hit. It is inserted verbatim (the caller
            is responsible for escaping); the first 250 characters form the
            summary and the remainder is revealed when the card is expanded.
        meta: Value displayed as the hit's docid; it is HTML-escaped.

    Returns:
        The HTML fragment for the hit, terminated by a horizontal rule.
    """
    return _hit_html(result[:_SUMMARY_CHARS], result[_SUMMARY_CHARS:], meta)


def format_results(results, query):
    """Render a batch of hits as HTML with the query words highlighted.

    Each document is split into summary and remainder *before* any markup
    is added, then both parts are HTML-escaped and highlighted. Cutting the
    raw text (rather than the generated markup) guarantees the cut can never
    fall inside a tag or a character entity.

    Args:
        results: Batch of hits exposing parallel sequences under
            ``TEXT_FIELD`` and ``METADATA_FIELD``.
        query: The raw, untokenized user query. It is split on whitespace
            and every occurrence of each word is highlighted
            (case-sensitively).

    Returns:
        The HTML fragments of all hits joined by newlines; an empty string
        when there are no hits.
    """
    pattern = _query_pattern(query)
    return "\n".join(
        _hit_html(
            _highlight(text[:_SUMMARY_CHARS], pattern),
            _highlight(text[_SUMMARY_CHARS:], pattern),
            meta,
        )
        for text, meta in zip(results[TEXT_FIELD], results[METADATA_FIELD])
    )


def _hit_html(summary, rest, meta):
    """Build the card for one hit from pre-rendered summary/remainder markup."""
    # Only hint at hidden content when there actually is some.
    ellipsis = "..." if rest else ""
    return (
        f'<div style="color:#2a5cb3;font-weight: 500"><u>docid: {html.escape(str(meta))}</u></div><br>'
        f"<div><details><summary>{summary}{ellipsis}</summary><p>{rest}</p></details></div><br><hr><br>"
    )


def _query_pattern(query):
    """Compile a regex matching any word of *query*, or None if it has none."""
    # Longest words first so that for overlapping words ("foo", "foobar")
    # the alternation prefers the longer match.
    words = sorted(set(query.split()), key=lambda w: (-len(w), w))
    if not words:
        return None
    # A single capturing group makes re.split() return the matches as well.
    return re.compile("(" + "|".join(re.escape(w) for w in words) + ")")


def _highlight(text, pattern):
    """HTML-escape *text* and wrap every match of *pattern* in a bold tag."""
    if pattern is None:
        return html.escape(text)
    # With one capturing group, split() alternates non-match / match pieces,
    # so odd positions are the query words. Doing this in a single pass over
    # the raw text means inserted tags are never themselves searched.
    pieces = pattern.split(text)
    return "".join(
        f'<b style="color:#2a5cb3;font-weight: 700">{html.escape(piece)}</b>'
        if position % 2
        else html.escape(piece)
        for position, piece in enumerate(pieces)
    )
    
def page_0(query):
    """Run a new search for *query* and render its first page of hits.

    Args:
        query: Raw text from the search box.

    Returns:
        A 4-tuple matching the outputs wired to the search events: the HTML
        of the first page, the hit ids wrapped in a one-row table
        (``[ix]``) so later page clicks can reuse them, an update that makes
        the pagination row visible, and the unmodified query text.
    """
    untokenized_query = query
    hits = searcher.search(tokenize_sentence(query), k=NUM_PAGES * RESULTS_PER_PAGE)
    ix = [int(hit.docid) for hit in hits]
    results = _render_page(ix, 0, untokenized_query)
    return results, [ix], gr.update(visible=True), untokenized_query


def page_i(i, ix, query):
    """Render page *i* (zero-based) of a previous search.

    Args:
        i: Zero-based page number.
        ix: One-row table whose first row holds the hit ids stored by
            ``page_0``.
        query: The raw query text, used for highlighting.

    Returns:
        A 3-tuple: the HTML of the requested page, the hit ids re-wrapped
        as ``[ix]``, and the query text.
    """
    ix = ix[0]
    results = _render_page(ix, int(i), query)
    return results, [ix], query


def _render_page(ix, page, query):
    """Format the hits of zero-based *page*, RESULTS_PER_PAGE hits per page."""
    # Slice the id list instead of sharding the selected dataset: sharding
    # into NUM_PAGES pieces spreads a short result list thinly over all
    # pages, whereas slicing fills each page before starting the next and
    # only loads the rows that are actually shown.
    start = page * RESULTS_PER_PAGE
    page_ix = ix[start:start + RESULTS_PER_PAGE]
    if len(page_ix) == 0:
        # Nothing on this page (no hits, or page past the last hit).
        return ""
    return format_results(ds.select(page_ix), query)
    
# UI layout and event wiring. The pagination buttons and their click handlers
# are generated from NUM_PAGES so the constant and the UI cannot drift apart.
with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo: #border:white;box-shadow:none;
    with gr.Row():
        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
    with gr.Row():
        with gr.Column(scale=1):
            # Invisible table holding the hit ids of the current search; it is
            # written by page_0 and read back by page_i on page clicks.
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)
        with gr.Column(scale=13):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="")
        with gr.Column(scale=1):
            # Empty rows above and below the search button.
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass

    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            # Hidden until the first search; page_0 makes it visible.
            with gr.Row(visible=False) as pagination:
                # left = gr.Button(value="◀", elem_id="b", visible=False).style(full_width=True)
                page_buttons = [
                    gr.Button(value=str(page_number), elem_id="b").style(full_width=True)
                    for page_number in range(1, NUM_PAGES + 1)
                ]
                # right = gr.Button(value="▶", elem_id="b", visible=False).style(full_width=True)
        with gr.Column(scale=1):
            pass

    # Pressing Enter in the textbox and clicking the search button both start
    # a new search.
    query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination, query])

    # One hidden Number per page, each holding a constant zero-based page
    # index that is passed to page_i as its first input.
    with gr.Box(visible=False):
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
    for page_button, page_index in zip(page_buttons, nums):
        page_button.click(
            fn=page_i,
            inputs=[page_index, result_list, query],
            outputs=[c, result_list, query],
        )
demo.launch(enable_queue=True, debug=True)