Tonic committed
Commit 2ca0200
1 Parent(s): 8c1d821

Update app.py

Files changed (1)
app.py +136 -14
app.py CHANGED
@@ -14,6 +14,7 @@ tokenizer.pad_token = tokenizer.eos_token
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 model.to(device)
 
+# Function for generating text
 def historical_generation(prompt, max_new_tokens=600):
     prompt = f"### Text ###\n{prompt}"
     inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
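Note on the context line in the hunk header: GPT-2 checkpoints define no pad token, so the tokenizer.pad_token = tokenizer.eos_token assignment is what lets the padding=True call above work on batched input. A minimal sketch of the same workaround, using the stock gpt2 checkpoint purely for illustration:

    from transformers import GPT2Tokenizer

    tok = GPT2Tokenizer.from_pretrained("gpt2")
    tok.pad_token = tok.eos_token  # reuse EOS as PAD, as app.py does
    # Without the line above, padding a batch raises "Asking to pad but the
    # tokenizer does not have a padding token".
    enc = tok(["short", "a much longer prompt"], padding=True, return_tensors="pt")
    # enc["attention_mask"] is 0 over the padded positions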
@@ -37,9 +38,10 @@ def historical_generation(prompt, max_new_tokens=600):
 
     # Decode the generated text
     generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
-
-    # Remove the prompt from the generated text
-    generated_text = generated_text.replace("### Text ###\n", "").strip()
+
+    # Extract text after "### Correction ###"
+    if "### Correction ###" in generated_text:
+        generated_text = generated_text.split("### Correction ###")[1].strip()
 
     # Tokenize the generated text
     tokens = tokenizer.tokenize(generated_text)
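For context, OCRonos-Vintage is an OCR-correction model, so the new extraction assumes the completion contains a "### Correction ###" section after the echoed prompt; a sketch on a made-up completion:

    generated_text = "### Text ###\nTh qu1ck brwn fox.\n### Correction ###\nThe quick brown fox."
    if "### Correction ###" in generated_text:
        generated_text = generated_text.split("### Correction ###")[1].strip()
    # generated_text == "The quick brown fox."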
@@ -47,17 +49,46 @@ def historical_generation(prompt, max_new_tokens=600):
     # Create highlighted text output
     highlighted_text = []
     for token in tokens:
-        # Remove special tokens and get the token type
-        clean_token = token.replace("Ġ", "").replace("</w>", "")
+        # Clean token and get token type
+        clean_token = token.replace("Ġ", "")
         token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
-
         highlighted_text.append((clean_token, token_type))
 
     return highlighted_text
 
+# Tokenizer information display
+import os
+os.system('python -m spacy download en_core_web_sm')
+import spacy
+from spacy import displacy
+
+nlp = spacy.load("en_core_web_sm")
+
+def text_analysis(text):
+    doc = nlp(text)
+    html = displacy.render(doc, style="dep", page=True)
+    html = (
+        "<div style='max-width:100%; max-height:360px; overflow:auto'>"
+        + html
+        + "</div>"
+    )
+    pos_count = {
+        "char_count": len(text),
+        "token_count": len(list(doc)),
+    }
+    pos_tokens = [(token.text, token.pos_) for token in doc]
+
+    return pos_tokens, pos_count, html
+
+# Gradio interface for text analysis
+def full_interface(prompt, max_new_tokens):
+    generated_highlight = historical_generation(prompt, max_new_tokens)
+    tokens, pos_count, html = text_analysis(prompt)
+    return generated_highlight, pos_count, html
+
 # Create Gradio interface
 iface = gr.Interface(
-    fn=historical_generation,
+    fn=full_interface,
     inputs=[
         gr.Textbox(
             label="Prompt",
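In the loop above, "Ġ" is the byte-level BPE marker GPT-2 uses for a leading space (the "</w>" handling this commit drops belongs to other BPE vocabularies), and the convert_tokens_to_ids/convert_ids_to_tokens round-trip returns the token string unchanged, so each highlight label is simply the raw BPE token. A rough illustration, with representative token boundaries:

    tokens = tokenizer.tokenize("The quick fox")
    # e.g. ['The', 'Ġquick', 'Ġfox']
    highlighted = [(t.replace("Ġ", ""), t) for t in tokens]
    # [('The', 'The'), ('quick', 'Ġquick'), ('fox', 'Ġfox')]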
@@ -72,15 +103,106 @@ iface = gr.Interface(
             value=600
         )
     ],
-    outputs=gr.HighlightedText(
-        label="Generated Historical Text",
-        combine_adjacent=True,
-        show_legend=True
-    ),
+    outputs=[
+        gr.HighlightedText(
+            label="Generated Historical Text",
+            combine_adjacent=True,
+            show_legend=True
+        ),
+        gr.JSON(label="Tokenizer Info"),
+        gr.HTML(label="Dependency Parse Visualization")
+    ],
     title="Historical Text Generation with OCRonos-Vintage",
-    description="Generate historical-style text using the OCRonos-Vintage model. The output shows token types as highlights.",
+    description="Generate historical-style text using OCRonos-Vintage and analyze the tokenizer output.",
     theme=gr.themes.Base()
 )
 
 if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
+
+# import torch
+# from transformers import GPT2LMHeadModel, GPT2Tokenizer
+# import gradio as gr
+
+# Load pre-trained model and tokenizer
+# model_name = "PleIAs/OCRonos-Vintage"
+# model = GPT2LMHeadModel.from_pretrained(model_name)
+# tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+
+# Set the pad token to be the same as the eos token
+# tokenizer.pad_token = tokenizer.eos_token
+
+# Set the device to GPU if available, otherwise use CPU
+# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# model.to(device)
+
+# def historical_generation(prompt, max_new_tokens=600):
+# prompt = f"### Text ###\n{prompt}"
+# inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=1024)
+# input_ids = inputs["input_ids"].to(device)
+# attention_mask = inputs["attention_mask"].to(device)
+
+# # Generate text
+# output = model.generate(
+# input_ids,
+# attention_mask=attention_mask,
+# max_new_tokens=max_new_tokens,
+# pad_token_id=tokenizer.eos_token_id,
+# top_k=50,
+# temperature=0.3,
+# top_p=0.95,
+# do_sample=True,
+# repetition_penalty=1.5,
+# bos_token_id=tokenizer.bos_token_id,
+# eos_token_id=tokenizer.eos_token_id
+# )
+
+# # Decode the generated text
+# generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
+
+# # Remove the prompt from the generated text
+# generated_text = generated_text.replace("### Text ###\n", "").strip()
+
+# # Tokenize the generated text
+# tokens = tokenizer.tokenize(generated_text)
+
+# # Create highlighted text output
+# highlighted_text = []
+# for token in tokens:
+# # Remove special tokens and get the token type
+# clean_token = token.replace("Ġ", "").replace("</w>", "")
+# token_type = tokenizer.convert_ids_to_tokens([tokenizer.convert_tokens_to_ids(token)])[0]
+
+# highlighted_text.append((clean_token, token_type))
+
+# return highlighted_text
+
+# # Create Gradio interface
+# iface = gr.Interface(
+# fn=historical_generation,
+# inputs=[
+# gr.Textbox(
+# label="Prompt",
+# placeholder="Enter a prompt for historical text generation...",
+# lines=3
+# ),
+# gr.Slider(
+# label="Max New Tokens",
+# minimum=50,
+# maximum=1000,
+# step=50,
+# value=600
+# )
+# ],
+# outputs=gr.HighlightedText(
+# label="Generated Historical Text",
+# combine_adjacent=True,
+# show_legend=True
+# ),
+# title="Historical Text Generation with OCRonos-Vintage",
+# description="Generate historical-style text using the OCRonos-Vintage model. The output shows token types as highlights.",
+# theme=gr.themes.Base()
+# )
+
+# if __name__ == "__main__":
+# iface.launch()
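The dependency-parse HTML introduced above comes from spaCy's displacy renderer; note that app.py shells out to download en_core_web_sm at startup. A minimal standalone version of the same call:

    import spacy
    from spacy import displacy

    nlp = spacy.load("en_core_web_sm")  # assumes the model is installed
    doc = nlp("The old ship sailed at dawn.")
    html = displacy.render(doc, style="dep", page=True)  # a complete HTML page as a string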
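full_interface returns three values, which Gradio maps positionally onto the three components in outputs. A self-contained sketch of the same wiring; the stub function and labels are illustrative, not taken from app.py:

    import gradio as gr

    def analyze(text):
        highlights = [(text, "TOKEN")]     # (span, label) pairs for HighlightedText
        stats = {"char_count": len(text)}  # dict rendered by the JSON component
        html = "<b>" + text + "</b>"       # raw markup for the HTML component
        return highlights, stats, html

    demo = gr.Interface(
        fn=analyze,
        inputs=gr.Textbox(),
        outputs=[gr.HighlightedText(), gr.JSON(), gr.HTML()],
    )

    if __name__ == "__main__":
        demo.launch()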