Almaatla committed on
Commit
c96166d
·
1 Parent(s): f0e5fc8

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -0
app.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Gradio app that counts the GPT-2 tokens contained in an uploaded PDF."""

import gradio as gr
import tiktoken
from PyPDF2 import PdfReader


def extract_text_from_pdf(file_path):
    """Return the concatenated text of every page of a PDF file.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, with no separator
        inserted between pages.
    """
    with open(file_path, "rb") as file:
        # PdfReader / .pages / extract_text() replace the PdfFileReader /
        # getNumPages / getPage / extractText names removed in PyPDF2 3.0.
        reader = PdfReader(file)
        # `or ""` keeps the join safe should a page yield no text at all.
        return "".join(page.extract_text() or "" for page in reader.pages)


def count_tokens(text):
    """Return the number of GPT-2 BPE tokens in *text*.

    Args:
        text: The string to tokenize.

    Returns:
        The token count as an int (0 for an empty string).
    """
    encoding = tiktoken.get_encoding("gpt2")
    # Document text is untrusted: treat strings such as "<|endoftext|>" as
    # ordinary text instead of letting encode() raise on special tokens.
    return len(encoding.encode(text, disallowed_special=()))


def count_tokens_in_file(file):
    """Count the tokens of the PDF uploaded through the Gradio file widget.

    Args:
        file: Value delivered by ``gr.File``. Accepted forms are ``None``
            (nothing uploaded), a path string, or an object exposing the
            path as ``.name``.

    Returns:
        The number of tokens in the document, or 0 when no file was given.
    """
    if file is None:
        # Button pressed before any upload: report zero rather than crash.
        return 0
    file_path = file if isinstance(file, str) else file.name
    # Extract text from the PDF file, then count its tokens.
    return count_tokens(extract_text_from_pdf(file_path))


# NOTE(review): the tab label mentions TXT but only ".pdf" uploads are
# accepted below; the strings are left untouched here.
with gr.Blocks() as demo:
    gr.Markdown("Upload your document to count their tokens")
    with gr.Tab("Upload PDF & TXT"):
        docs_input = gr.File(file_count="single", file_types=[".pdf"])
        tb_tokenCount = gr.Textbox(label='Number of tokens')
        btn_count = gr.Button("Count token")
        btn_count.click(
            count_tokens_in_file,
            inputs=[docs_input],
            outputs=[tb_tokenCount],
        )


if __name__ == "__main__":
    # Start the web server only when run as a script, so importing this
    # module (tests, reload tooling) stays free of side effects.
    demo.launch()