import gradio as gr
from datasets import load_dataset
import tempfile
import re

# Common honorific abbreviations whose trailing period does NOT end a sentence.
TITLES = {"Mr.", "Mrs.", "Ms.", "Dr.", "Prof.", "Rev.", "Sr.", "Jr."}


def is_latin(text):
    """Return True if *text* contains only ASCII (Latin) characters."""
    # Any character outside the 7-bit ASCII range counts as non-Latin.
    return not re.search(r'[^\x00-\x7F]', text)


def clean_text(text):
    """Remove non-Latin sentences and literal '**' markers from *text*.

    The text is split on sentence-ending punctuation; any sentence that
    contains a non-ASCII character is dropped entirely.
    """
    # Strip markdown bold markers.
    text = re.sub(r'\*\*', '', text)
    # Split into sentences and keep only the all-ASCII ones.
    sentences = re.split(r'(?<=[.!?])\s+', text)
    cleaned_sentences = [s for s in sentences if is_latin(s)]
    return ' '.join(cleaned_sentences)


def process_text(text):
    """Insert a newline after sentence-ending periods, except for titles.

    Also replaces each '### Simplified Version' marker with 'Chapter N',
    where N increments per occurrence.
    """
    words = text.split()
    parts = []  # collected output fragments; joined once at the end
    chapter_counter = 3  # first marker becomes "Chapter 3" -- TODO confirm intended start

    for i, word in enumerate(words):
        if not word:
            # Words blanked out by a previous '### Simplified Version' match;
            # skipping them avoids injecting stray spaces into the output.
            continue
        if word in TITLES:
            # Honorific abbreviation: its period is not a sentence boundary.
            parts.append(word + " ")
        elif word.endswith('.'):
            # NOTE: a word ending in '."' has '"' as its final character, so
            # it never reaches this branch and gets no newline, as intended.
            parts.append(word + "\n")
        elif (word == "###" and i + 2 < len(words)
              and words[i + 1] == "Simplified" and words[i + 2] == "Version"):
            parts.append(f"Chapter {chapter_counter} ")
            chapter_counter += 1
            # Blank out the next two words so they are skipped above.
            words[i + 1] = ""
            words[i + 2] = ""
        else:
            parts.append(word + " ")

    # Remove trailing spaces and newlines.
    return "".join(parts).strip()


def combine_dataset_texts(dataset_name, split, text_column):
    """Combine all rows of *text_column* from a Hugging Face dataset.

    Loads *split* of *dataset_name*, concatenates every row's text, cleans
    it (non-Latin sentences and '**' removed), reformats it with
    process_text, and writes the result to a temporary .txt file.

    Returns:
        The path of the temporary file (delete=False so Gradio can serve it).

    Raises:
        gr.Error: with a user-facing message on any failure.
    """
    try:
        # Load the dataset from the Hugging Face Hub.
        dataset = load_dataset(dataset_name, split=split)

        # Verify the requested text column exists.
        if text_column not in dataset.column_names:
            raise gr.Error(f"Column '{text_column}' not found in dataset")

        # Combine all texts into a single string without separating datapoints.
        combined_text = " ".join([example[text_column] for example in dataset])

        # Clean, then reformat (newlines after sentences, chapter headings).
        cleaned_text = clean_text(combined_text)
        processed_text = process_text(cleaned_text)

        with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".txt") as f:
            f.write(processed_text)
            return f.name
    except gr.Error:
        # Already carries a user-facing message; don't re-wrap it.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing dataset: {str(e)}")


# Create Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## Hugging Face Dataset Text Combiner")
    gr.Markdown("Combine all text files from a Hugging Face dataset into a single file")

    with gr.Row():
        dataset_input = gr.Textbox(label="Dataset Name", placeholder="username/dataset-name")
        split_input = gr.Textbox(label="Split", value="train")
        column_input = gr.Textbox(label="Text Column", value="text")

    submit_btn = gr.Button("Combine Texts")

    with gr.Row():
        output_file = gr.File(label="Combined Text File")
        error_out = gr.Textbox(label="Error Output", visible=False)

    submit_btn.click(
        fn=combine_dataset_texts,
        inputs=[dataset_input, split_input, column_input],
        outputs=output_file,
        api_name="combine_texts"
    )

if __name__ == "__main__":
    demo.launch()