"""Gradio app that projects sentence embeddings of short texts onto 2-D with PCA.

The user edits up to ``len(default_texts)`` (identifier, text) pairs; pressing
the button embeds every non-blank text with a multilingual sentence-transformer
model, reduces the embeddings to two principal components and shows a labelled
scatter plot.
"""

import gradio as gr
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer

# Default values for identifiers and texts (same order, same length).
default_identifiers = [
    "Dog_en",
    "Dog_simple_en",
    "Cat_en",
    "Cat_simple_en",
    "Train_en",
    "Train_simple_en",
    "Boat_en",
    "Boat_simple_en",
]

default_texts = [
    "The dog (Canis familiaris or Canis lupus familiaris) is a domesticated descendant of the wolf. Also called the domestic dog, it was domesticated from an extinct population of wolves during the Late Pleistocene, over 14,000 years ago by hunter-gatherers, prior to the development of agriculture. The dog was the first species to be domesticated by humans. Experts estimate that due to their long association with humans, dogs have expanded to a large number of domestic individuals and gained the ability to thrive on a starch-rich diet that would be inadequate for other canids.",
    "Dogs are mammals, usually kept as pets, for work on farms or for the police. Some dogs are trained to be rescue dogs and join teams such as mountain rescue. They have been bred by humans from their ancestral wolves. They were the first animals to live with humans",
    "The cat (Felis catus), also referred to as domestic cat or house cat, is a small domesticated carnivorous mammal. It is the only domesticated species of the family Felidae. Advances in archaeology and genetics have shown that the domestication of the cat occurred in the Near East around 7500 BC. It is commonly kept as a house pet and farm cat, but also ranges freely as a feral cat avoiding human contact. Valued by humans for companionship and its ability to kill vermin, the cat's retractable claws are adapted to killing small prey like mice and rats. It has a strong, flexible body, quick reflexes, and sharp teeth, and its night vision and sense of smell are well developed. It is a social species, but a solitary hunter and a crepuscular predator. Cat communication includes vocalizations like meowing, purring, trilling, hissing, growling, and grunting as well as cat body language. It can hear sounds too faint or too high in frequency for human ears, such as those made by small mammals. It secretes and perceives pheromones.",
    "Cats, also called domestic cats (Felis catus), are small, carnivorous (meat eating) mammals, of the family Felidae. Cats have been domesticated (tamed) for nearly 10,000 years.",
    'A train (from Old French trahiner, from Latin trahere, "to pull, to draw") is a series of connected vehicles that run along a railway track and transport people or freight. Trains are typically pulled or pushed by locomotives (often known simply as "engines"), though some are self-propelled, such as multiple units or railcars. Passengers and cargo are carried in railroad cars, also known as wagons or carriages. Trains are designed to a certain gauge, or distance between rails. Most trains operate on steel tracks with steel wheels, the low friction of which makes them more efficient than other forms of transport.',
    "A rail train, otherwise referred to as simply a train, is a set of railway[a] cars (also called vehicles) that are tied together with or without a locomotive. Trains are used to carry people, and also things like raw material, finished goods, cargo, and waste. The vehicles that carry freight are called cars (in the United States) or wagons (in the United Kingdom). The ones that carry passengers are often called coaches or carriages. A place where a train stops to let people get on and off is called a train station or railway station.",
    "A boat is a watercraft of a large range of types and sizes, but generally smaller than a ship, which is distinguished by its larger size or capacity, its shape, or its ability to carry boats. Small boats are typically used on inland waterways such as rivers and lakes, or in protected coastal areas. However, some boats (such as whaleboats) were intended for offshore use. In modern naval terms, a boat is a vessel small enough to be carried aboard a ship.",
    "A boat is a vehicle used to travel on water. It is smaller than a ship and can be lifted out of the water and carried on a ship. Some boats have sails, some are powered by rowing with oars, and some use motors. Those that use steam engines are steamboats. There are some boats that can go underwater. They are called submarines. A narrowboat is a boat designed to be used on narrow canals. It is sometimes called a barge.",
]

# Load pre-trained sentence transformer model (done once, at import time).
model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')


def _has_content(value):
    """Return True when *value* is neither missing nor whitespace-only."""
    return not pd.isna(value) and bool(str(value).strip())


def compute_pca(data):
    """Embed the given texts and return a 2-D PCA scatter plot.

    Args:
        data: Rows of ``[identifier, text]`` pairs (anything
            ``pandas.DataFrame(data, columns=["Identifiant", "Texte"])``
            accepts).

    Returns:
        A Plotly figure with one labelled point per valid row, or a Gradio
        update that clears the plot and sets an explanatory label when fewer
        than two valid rows are available.
    """
    df = pd.DataFrame(data, columns=["Identifiant", "Texte"])

    # Keep only rows where both the identifier and the text are non-blank.
    # Missing values (None/NaN) are treated as blank instead of slipping
    # through a plain string comparison.
    keep = (
        df["Identifiant"].map(_has_content).astype(bool)
        & df["Texte"].map(_has_content).astype(bool)
    )
    valid_entries = df[keep].reset_index(drop=True)

    if valid_entries.empty:
        # gr.update() is the component-agnostic updater; the per-component
        # gr.Plot.update() classmethod no longer exists in Gradio >= 4.
        return gr.update(
            value=None,
            label="Rien à analyser. Donnez moi du texte svp!",
        )

    if len(valid_entries) < 2:
        # PCA(n_components=2) requires at least two samples and would raise
        # a ValueError otherwise.
        return gr.update(
            value=None,
            label="Il faut au moins 2 textes pour l'analyse.",
        )

    # Generate embeddings
    embeddings = model.encode(valid_entries["Texte"].tolist())

    # Perform PCA to reduce to 2 dimensions
    pca_result = PCA(n_components=2).fit_transform(embeddings)

    # Add PCA results to the DataFrame
    valid_entries["PC1"] = pca_result[:, 0]
    valid_entries["PC2"] = pca_result[:, 1]

    # Plot the PCA result with identifiers as labels
    fig = px.scatter(
        valid_entries,
        x="PC1",
        y="PC2",
        text="Identifiant",
        title="PCA des plongements sémantiques",
    )
    # Position the text labels above the points
    fig.update_traces(textposition="top center")
    return fig


def text_editor_app():
    """Build and return the Gradio Blocks UI (without launching it)."""
    with gr.Blocks() as demo:
        identifier_inputs = []
        text_inputs = []

        gr.Markdown("### Entrez au moins 2 textes et leurs identifiants:")

        # One (identifier, text) editor per default entry; the count follows
        # the defaults rather than a hard-coded number.
        for position, (identifier, text) in enumerate(
            zip(default_identifiers, default_texts), start=1
        ):
            with gr.Column():
                id_input = gr.Textbox(
                    label=f"Identifiant {position}", value=identifier
                )
                text_input = gr.Textbox(label=f"Texte {position}", value=text)
            identifier_inputs.append(id_input)
            text_inputs.append(text_input)
            gr.Markdown("---")  # Horizontal rule between entries

        # Button to run the analysis
        analyze_button = gr.Button("Analyser")

        # Output plot
        output_plot = gr.Plot(label="Visualisation PCA ")

        def collect_inputs(*args):
            """Pair up the flat ``id1, text1, id2, text2, ...`` values."""
            data = [list(pair) for pair in zip(args[::2], args[1::2])]
            return compute_pca(data)

        # Interleave the components so they match collect_inputs' layout.
        inputs = [
            component
            for pair in zip(identifier_inputs, text_inputs)
            for component in pair
        ]

        analyze_button.click(
            fn=collect_inputs, inputs=inputs, outputs=output_plot
        )

    return demo


# Launch the app
text_editor_app().launch()