Spaces:
Running
Running
File size: 6,095 Bytes
5ad8cbc acfde36 5ad8cbc acfde36 5ad8cbc acfde36 5ad8cbc acfde36 5ad8cbc d6e3fde 5ad8cbc f602886 5ad8cbc be6e01f 5ad8cbc be6e01f 5ad8cbc be6e01f 5ad8cbc be6e01f 5ad8cbc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import huggingface_hub as hf
import gradio as gr
import os, datetime
# Filesystem handle used for every glob/read/exists/open call below.
# Indexing os.environ raises KeyError at import time if HF_TOKEN is unset.
fs = hf.HfFileSystem(token=os.environ["HF_TOKEN"])
# Path prefix under which paper files are listed, read and written.
datasetdir = "datasets/yoinked/blue-arxiv-papers/"
# Stylesheet handed to gr.Blocks(css=...); the class names here are the ones
# emitted by make_paper_card / make_paper_cards.
basecss = """
.caaard-container {
width: 250px;
padding: 20px;
border: 3px solid black;
border-radius: 15px;
text-align: left;
}
.title {
font-size: 24px;
margin-bottom: 10px;
text-align: center;
}
.caaard-containers {
display: flex; gap: 20px; flex-wrap: wrap;
}
.extra-info {
font-size: 14px;
line-height: 1.5;
}
.extra-info-paperid {
font-size: 14px;
line-height: 1.5;
color: #222;
}"""
# JavaScript handed to gr.Blocks(js=...). It defines copyToClipboard, which
# the cards reference from their onclick attribute; it copies the text of the
# card's ".title" element via a temporary <textarea>.
jscode = """
function copyToClipboard(container) {
const titleElement = container.querySelector(".title");
const titleText = titleElement.textContent;
const tempTextArea = document.createElement("textarea");
tempTextArea.value = titleText;
document.body.appendChild(tempTextArea);
tempTextArea.select();
document.execCommand("copy");
document.body.removeChild(tempTextArea);
}
"""
def get_papers():
    """Return the paths matched by the ``**.md`` glob under ``datasetdir``."""
    pattern = datasetdir + "**.md"
    return fs.glob(pattern)
def _front_matter_fields(metadata):
    """Parse ``key: value`` lines of a front-matter block into a dict.

    Only lines containing ``": "`` are considered, and the first occurrence
    of a key wins, so text inside one field's value cannot be mistaken for
    another field.
    """
    fields = {}
    for line in metadata.splitlines():
        key, sep, value = line.partition(": ")
        if sep:
            fields.setdefault(key.strip(), value.strip())
    return fields


def get_papers_metadata(papiers=None):
    """Read each paper file and return its parsed front matter.

    Args:
        papiers: Iterable of file paths to read. When ``None``, the paths
            returned by ``get_papers()`` are used.

    Returns:
        A list of dicts with the keys ``fname``, ``metadata`` (the raw
        front-matter text), ``author``, ``title``, ``tags`` (list of str),
        ``abstract``, ``date_published`` and ``paperid``. Missing text
        fields default to ``"unknown"`` and missing tags to ``[]``. Files
        without a front-matter block or without a ``paperid`` are skipped.
    """
    if papiers is None:
        papiers = get_papers()
    metadatas = []
    for paper in papiers:
        # read_text is the documented fsspec call for fetching a file as str.
        papertxt = fs.read_text(paper, encoding="utf-8")
        parts = papertxt.split("---", 2)
        if len(parts) < 2:
            # No front matter at all: skip this file instead of aborting
            # the whole listing with an IndexError.
            continue
        metadata = parts[1]
        fields = _front_matter_fields(metadata)
        paperid = fields.get("paperid")
        if paperid is None:
            # Without an id the paper cannot be opened, so do not list it.
            continue
        tags = [
            tag.strip()
            for tag in fields.get("tags", "").split(",")
            if tag.strip()
        ]
        metadatas.append(
            {
                "fname": paper,
                "metadata": metadata,
                "author": fields.get("author", "unknown"),
                "title": fields.get("title", "unknown"),
                "tags": tags,
                "abstract": fields.get("abstract", "unknown"),
                "date_published": fields.get("date_published", "unknown"),
                "paperid": paperid,
            }
        )
    return metadatas
def make_paper_card(md):
    """Render one paper's metadata as an HTML card.

    Args:
        md: Mapping with the keys ``abstract``, ``title``, ``author``,
            ``date_published`` and ``paperid``.

    Returns:
        An HTML snippet: a ``caaard-container`` div whose tooltip is the
        abstract and whose click handler is ``copyToClipboard(this)``.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    # Every value is user-supplied, so escape it (quotes included, because
    # the abstract is placed inside a double-quoted attribute).
    abstract = escape(str(md["abstract"]), quote=True)
    title = escape(str(md["title"]), quote=True)
    author = escape(str(md["author"]), quote=True)
    published = escape(str(md["date_published"]), quote=True)
    paperid = escape(str(md["paperid"]), quote=True)
    return f"""
<div class="caaard-container" onclick="copyToClipboard(this)" title="{abstract}">
<div class="title">{title}</div>
<br><br>
<div class="extra-info">author: {author}</div>
<div class="extra-info">published: {published}</div>
<div class="extra-info-paperid">id: {paperid}</div>
</div>
"""
def make_paper_cards(tags=None):
    """Build the HTML for all paper cards, optionally filtered by tag.

    Args:
        tags: Comma-separated tag names, or ``None``/empty for no filter.
            A paper is kept when it carries at least one requested tag.

    Returns:
        One ``caaard-containers`` div containing a card (followed by
        ``<br>``) for every selected paper.
    """
    mds = get_papers_metadata()
    # Drop empty entries so a blank query (or trailing comma) means
    # "no filter" rather than "match nothing".
    wanted = {tag.strip() for tag in (tags or "").split(",")} - {""}
    if wanted:
        mds = [
            md
            for md in mds
            if not wanted.isdisjoint(tag.strip() for tag in md["tags"])
        ]
    cards = "".join(make_paper_card(md) + "<br>" for md in mds)
    return "<div class='caaard-containers'>" + cards + "</div>"
def get_paper_markdown(paperid):
    """Return the markdown body of the paper with the given id.

    Args:
        paperid: Paper id as typed by the user; surrounding whitespace is
            ignored.

    Returns:
        Everything after the closing ``---`` of the front matter, or the
        string ``"## paper not found"`` when no paper has that id or the
        file has no body section.
    """
    wanted = (paperid or "").strip()
    fname = next(
        (
            entry["fname"]
            for entry in get_papers_metadata()
            if entry["paperid"].strip() == wanted
        ),
        None,
    )
    if fname is None:
        return "## paper not found"
    # maxsplit=2 keeps the body intact even when it contains "---" itself
    # (a markdown horizontal rule); an unbounded split would cut it there.
    parts = fs.read_text(fname, encoding="utf-8").split("---", 2)
    if len(parts) < 3:
        return "## paper not found"
    return parts[2]
def _clean_field(value):
    """Make *value* safe to store as a single front-matter line."""
    bad_chars = "<>:|\\"  # primitive anti-xss sanitization
    value = (value or "").translate(str.maketrans("", "", bad_chars))
    # A line break would let the value introduce extra "key: value" lines.
    value = " ".join(value.splitlines())
    # "---" delimits the front matter, so it must not appear inside a field.
    while "---" in value:
        value = value.replace("---", "--")
    return value.strip()


def publish_paper(title, authors, tags, abst, data):
    """Write a new paper file and report the outcome.

    The paper id has the form ``<year>-<MMDD>.<n>`` where ``n`` is the first
    number in 1..100 whose file does not exist yet under ``datasetdir``.

    Args:
        title: Paper title.
        authors: Author name(s).
        tags: Comma-separated tags.
        abst: Short abstract, shown as the card tooltip.
        data: Markdown body of the paper.

    Returns:
        A status message: the new paper id on success, or an explanation
        when all 100 ids for today are already taken.
    """
    # Take the timestamp once so year/month/day cannot straddle midnight.
    now = datetime.datetime.now()
    year = now.year
    month = f"{now.month:02d}"
    day = f"{now.day:02d}"

    paperid = None
    for idx in range(1, 101):
        candidate = f"{year}-{month}{day}.{idx}"
        if not fs.exists(datasetdir + candidate + ".md"):
            paperid = candidate
            break
    if paperid is None:
        return "could not generate paperid, try again tomorrow"

    title = _clean_field(title)
    authors = _clean_field(authors)
    tags = _clean_field(tags)
    abst = _clean_field(abst)
    metadata = (
        "---\n"
        f"title: {title}\n"
        f"author: {authors}\n"
        f"tags: {tags}\n"
        f"abstract: {abst}\n"
        f"date_published: {year}-{month}-{day}\n"
        f"paperid: {paperid}\n"
        "---\n"
    )
    with fs.open(datasetdir + paperid + ".md", "w") as f:
        f.write(metadata + (data or ""))
    # The return value feeds the "status" textbox; report the new id.
    return f"published as {paperid}"
def makepreview(x):
    """Return the markdown source unchanged so it can be shown as a preview."""
    preview = x
    return preview
# Three-tab UI: search papers by tag, read one paper by id, publish a paper.
# NOTE(review): tab/row nesting below is the most plausible reading of the
# flattened source; confirm the layout against the running app.
# NOTE(review): the cards call copyToClipboard from an inline onclick
# attribute; confirm that code passed via `js=` is globally visible in the
# installed Gradio version.
with gr.Blocks(css=basecss, js=jscode, theme='NoCrypt/miku') as demo:
    with gr.Tab("search"):
        with gr.Row():
            query = gr.Textbox(label="tags (optional, comma seperated)", lines=1, interactive=True)
            searchbutton = gr.Button("search")
        with gr.Row():
            papercards = gr.HTML()
    with gr.Tab("read"):
        with gr.Row():
            paperid = gr.Textbox(label="paper id", lines=1, interactive=True)
            readbutton = gr.Button("read")
        with gr.Row():
            paper = gr.Markdown()
    with gr.Tab("publish"):
        with gr.Row():
            title = gr.Textbox(label="title", lines=1, interactive=True)
            authors = gr.Textbox(label="author(s)", lines=1, interactive=True)
        with gr.Row():
            tags = gr.Textbox(label="tags (optional, comma seperated)", lines=1, interactive=True)
            abst = gr.Textbox(label="abriged abstract (aka tooltip)", lines=2, interactive=True)
        # max_lines is a line count, so pass an int rather than the float 1e3.
        markd = gr.Textbox(label="markdown", lines=10, interactive=True, max_lines=1000)
        preview = gr.Markdown()
        with gr.Row():
            status = gr.Textbox(label="status", lines=1, interactive=False)
            publishbutton = gr.Button("publish")
    # Event wiring has to happen inside the Blocks context.
    markd.change(fn=makepreview, inputs=markd, outputs=preview)
    publishbutton.click(fn=publish_paper, inputs=[title, authors, tags, abst, markd], outputs=status)
    searchbutton.click(fn=make_paper_cards, inputs=query, outputs=papercards)
    readbutton.click(fn=get_paper_markdown, inputs=paperid, outputs=paper)
demo.launch()