Spaces:
Sleeping
Sleeping
File size: 5,485 Bytes
10be4e3 08d9321 30e2235 10be4e3 08d9321 30e2235 10be4e3 1855b51 10be4e3 1a2fbfc 11be492 1a2fbfc 10be4e3 d6cb72b 30e2235 10be4e3 1a2fbfc d6cb72b 10be4e3 d6cb72b 10be4e3 1a2fbfc d6cb72b 10be4e3 e98dc14 10be4e3 d6cb72b 10be4e3 d6cb72b 10be4e3 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import re
from itertools import chain

import gradio as gr
from datasets import load_from_disk
from nltk.util import everygrams
from pyserini.analysis import JWhiteSpaceAnalyzer
from pyserini.search.lucene import LuceneSearcher
# Lucene searcher opened on the local "index" path; page_0 sends every query to it.
searcher = LuceneSearcher("index")
# Queries reach the searcher already tokenized into space-separated n-grams
# (see tokenize_sentence), so a whitespace analyzer is installed.
# NOTE(review): presumably this makes Lucene split on spaces only, with no
# stemming or lower-casing - confirm against how the index was built.
searcher.set_analyzer(JWhiteSpaceAnalyzer())
def tokenize_word(word, min_len=2, max_len=4):
    """Return the n-grams of ``word`` as strings.

    ``everygrams`` is asked for all n-grams whose length lies between
    ``min_len`` and ``max_len``; each n-gram is joined back into one string.
    """
    grams = everygrams(word, min_len=min_len, max_len=max_len)
    return ["".join(pieces) for pieces in grams]
def tokenize_sentence(sentence, min_len=2, max_len=4):
    """Convert ``sentence`` into one space-separated string of n-grams.

    The sentence is split on whitespace, each word is expanded with
    ``tokenize_word`` and the resulting n-grams are joined in word order.
    """
    grams_per_word = (
        tokenize_word(token, min_len=min_len, max_len=max_len)
        for token in sentence.split()
    )
    return " ".join(chain.from_iterable(grams_per_word))
# Dataset loaded from the local "data" path; its rows are selected with the
# integer docids returned by the searcher (see page_0 / page_i).
ds = load_from_disk("data")
# Number of result pages, i.e. of pagination buttons in the UI.
NUM_PAGES = 10 # STATIC. THIS CAN'T CHANGE BECAUSE GRADIO CAN'T DYNAMICALLY CREATE COMPONENTS.
# NUM_PAGES * RESULTS_PER_PAGE is the number of hits requested per search.
RESULTS_PER_PAGE = 5
# Dataset column holding the text that is highlighted and displayed.
TEXT_FIELD = "content"
# Dataset column shown as the "docid" label above each result.
METADATA_FIELD = "docid"
def result_html(result, meta):
    """Build the HTML snippet for a single search result.

    The first 250 characters of ``result`` become the always-visible summary
    of a ``<details>`` element (followed by "..."); the remainder is placed in
    the collapsible body. ``meta`` is rendered as the "docid" label on top.
    Both values are interpolated as-is.
    """
    preview, remainder = result[:250], result[250:]
    header = f"<div style=\"color:#2a5cb3;font-weight: 500\"><u>docid: {meta}</u></div><br>"
    body = f"<div><details><summary>{preview}...</summary><p>{remainder}</p></details></div><br><hr><br>"
    return header + body
def format_results(results, query):
    """Render a batch of results as one HTML string with query words highlighted.

    Args:
        results: Batch indexable by ``TEXT_FIELD`` and ``METADATA_FIELD``,
            each giving a sequence; the two sequences are paired item by item.
        query: Raw query string; it is split on whitespace and every exact
            (case-sensitive) occurrence of a word is wrapped in a bold tag.

    Returns:
        The ``result_html`` snippets of all results joined with newlines
        (an empty string when there are no results).
    """
    def _emphasize(match):
        return f"<b style=\"color:#2a5cb3;font-weight: 700\">{match.group(0)}</b>"

    texts = results[TEXT_FIELD]
    # De-duplicate the words (keeping order) and try longer words first so a
    # word that is a prefix of another one does not shadow the longer match.
    words = sorted(dict.fromkeys(query.split()), key=len, reverse=True)
    if words:
        # All words are highlighted in ONE pass. Replacing word after word
        # would let a later word (e.g. "b", "color", "style") match inside the
        # markup inserted for an earlier one and corrupt the HTML.
        pattern = re.compile("|".join(re.escape(word) for word in words))
        texts = [pattern.sub(_emphasize, text) for text in texts]
    # NOTE(review): the text itself is inserted into the page unescaped, and
    # result_html slices it at a fixed offset, which can cut through a tag.
    return "\n".join(
        result_html(text, meta)
        for text, meta in zip(texts, results[METADATA_FIELD])
    )
def page_0(query):
    """Run a new search and render its first page.

    Args:
        query: Raw query string from the search box.

    Returns:
        A 4-tuple for the wired outputs: the HTML of the first page, the hit
        docids wrapped in a one-row list (stored in the hidden dataframe so
        that ``page_i`` can reuse them), an update that makes the pagination
        row visible, and the unchanged query text.
    """
    untokenized_query = query
    # The searcher is queried with the n-gram form of the text.
    tokenized_query = tokenize_sentence(query)
    # A query that produces no n-grams (e.g. an empty one) cannot match
    # anything, so the search call is skipped.
    if tokenized_query:
        hits = searcher.search(tokenized_query, k=NUM_PAGES * RESULTS_PER_PAGE)
    else:
        hits = []
    ix = [int(hit.docid) for hit in hits]
    # Take the first RESULTS_PER_PAGE docids directly. Sharding the selection
    # into NUM_PAGES parts gives the same rows for a full result set, but
    # spreads a short one thinly over all pages.
    first_page = ix[:RESULTS_PER_PAGE]
    results = format_results(ds.select(first_page), untokenized_query) if first_page else ""
    return results, [ix], gr.update(visible=True), untokenized_query
def page_i(i, ix, query):
    """Render page ``i`` of an earlier search.

    Args:
        i: Zero-based page number.
        ix: One-row table whose single row holds the hit docids, as produced
            by ``page_0``.
        query: Raw query string, used for highlighting only.

    Returns:
        A 3-tuple for the wired outputs: the HTML of the requested page, the
        docids re-wrapped in a one-row list, and the unchanged query text.
    """
    ix = ix[0]
    # Each page is a fixed-size slice of the hit list. Pages beyond the last
    # hit are empty and render as an empty string.
    start = int(i) * RESULTS_PER_PAGE
    page_ix = ix[start:start + RESULTS_PER_PAGE]
    results = format_results(ds.select(page_ix), query) if page_ix else ""
    return results, [ix], query
# UI: a title row, a search row (text box + button) and a results row holding
# the rendered HTML plus NUM_PAGES pagination buttons.
with gr.Blocks(css="#b {min-width:15px;background:transparent;}") as demo:  # border:white;box-shadow:none;
    with gr.Row():
        gr.Markdown(value="""## <p style="text-align: center;"> Code search </p>""")
    with gr.Row():
        with gr.Column(scale=1):
            # Invisible table used as storage: page_0 writes the hit docids
            # into it and page_i reads them back.
            result_list = gr.Dataframe(type="array", visible=False, col_count=1)
        with gr.Column(scale=13):
            query = gr.Textbox(lines=1, max_lines=1, placeholder="Search…", label="")
        with gr.Column(scale=1):
            # Empty rows above and below the search button.
            with gr.Row(scale=1):
                pass
            with gr.Row(scale=1):
                submit_btn = gr.Button("🔍", elem_id="b").style(full_width=False)
            with gr.Row(scale=1):
                pass
    with gr.Row():
        with gr.Column(scale=1):
            pass
        with gr.Column(scale=13):
            c = gr.HTML(label="Results")
            # Hidden until page_0 returns an update that makes it visible.
            with gr.Row(visible=False) as pagination:
                # TODO: previous/next arrow buttons ("◀" / "▶") were sketched
                # around the page buttons but are disabled.
                # One button per page, labelled from 1, created while the
                # layout is built so the count always follows NUM_PAGES.
                page_buttons = [
                    gr.Button(value=str(page_number), elem_id="b").style(full_width=True)
                    for page_number in range(1, NUM_PAGES + 1)
                ]
        with gr.Column(scale=1):
            pass
    # Pressing Enter in the text box and clicking the button both start a search.
    query.submit(fn=page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    submit_btn.click(page_0, inputs=[query], outputs=[c, result_list, pagination, query])
    with gr.Box(visible=False):
        # Invisible number components that pass each button's zero-based page
        # number to page_i as an input.
        nums = [gr.Number(i, visible=False, precision=0) for i in range(NUM_PAGES)]
    for page_button, page_number in zip(page_buttons, nums):
        page_button.click(
            fn=page_i,
            inputs=[page_number, result_list, query],
            outputs=[c, result_list, query],
        )

demo.launch(enable_queue=True, debug=True)