sergiomar73 commited on
Commit
d3b2949
·
1 Parent(s): eba7dd5

Basic implementation ok

Browse files
Files changed (3) hide show
  1. README.md +4 -6
  2. app.py +125 -151
  3. requirements.txt +2 -1
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
- title: Rebuild Conversation
3
- emoji: 👁
4
  colorFrom: yellow
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
  python_version: 3.9.13
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Conversation rebuilder
3
+ emoji: 🗣️
4
  colorFrom: yellow
5
+ colorTo: orange
6
  sdk: gradio
7
  sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
  python_version: 3.9.13
11
+ ---
 
 
app.py CHANGED
@@ -1,153 +1,127 @@
1
- # import csv
2
- # import datetime
3
- # import gradio as gr
4
- # import pandas as pd
5
- # from io import BytesIO
6
- # from pathlib import Path
7
- # from urllib.parse import urlparse
8
-
9
-
10
- # def format_seconds(secs):
11
- # t = datetime.datetime(
12
- # year=1, month=1, day=1, hour=0, minute=0
13
- # ) + datetime.timedelta(seconds=secs)
14
- # return t.strftime("%M:%S.%f")[:-3]
15
-
16
-
17
- # def get_filename_and_extension(url):
18
- # parsed_url = urlparse(url)
19
- # path = parsed_url.path
20
- # filename = Path(path).name
21
- # filename_without_extension = Path(filename).stem
22
- # file_extension = Path(filename).suffix
23
- # return filename, filename_without_extension, file_extension
24
-
25
-
26
- # def calculate_times(input_url, input_text, ms_before, ms_after):
27
- # _, _, file_extension = get_filename_and_extension(input_url)
28
- # file_extension = file_extension.replace(".", "")
29
- # df = pd.DataFrame({"text": [], "start": [], "stop": [], "file": []})
30
- # lines = input_text.splitlines()
31
- # segments = []
32
- # if len(lines) != len(segments):
33
- # msg = f"DETECTED CLIPS AND INPUT LINES DO NOT MATCH!\n\nYou are expecting {len(lines)} clips BUT {len(segments)} segments have been found in the video file."
34
- # df.loc[len(df.index)] = ["", "", "", ""]
35
- # return msg, None, df
36
- # else:
37
- # res = []
38
- # for i in range(len(segments)):
39
- # line = lines[i].rstrip()
40
- # res.append(f"{line}\t{segments[i][0]}\t{segments[i][1]}\t{input_url}")
41
- # df.loc[len(df.index)] = [line, segments[i][0], segments[i][1], input_url]
42
- # df.to_csv(
43
- # "clips.tsv",
44
- # sep="\t",
45
- # encoding="utf-8",
46
- # index=False,
47
- # header=False,
48
- # quoting=csv.QUOTE_NONE,
49
- # )
50
- # return "\n".join(res), "clips.tsv", df
51
-
52
-
53
- # def load_video(input_url):
54
- # if input_url:
55
- # return input_url
56
- # return None
57
-
58
-
59
- # css = """
60
- # .required {background-color: #FFCCCB !important, font-size: 24px !important}
61
- # """
62
-
63
-
64
- # with gr.Blocks(title="Start and stop times", css=css) as app:
65
- # gr.Markdown(
66
- # """# Start and stop times generator
67
- # Please, fill the Video URL and Clip texts textboxes and click the Run button"""
68
- # )
69
- # with gr.Row():
70
- # with gr.Column(scale=3):
71
- # text1 = gr.Textbox(
72
- # lines=1,
73
- # placeholder="Video URL...",
74
- # label="Video URL",
75
- # elem_classes=["required"],
76
- # )
77
- # text2 = gr.Textbox(
78
- # lines=5,
79
- # max_lines=10,
80
- # placeholder="List of clip texts...",
81
- # label="Clip texts",
82
- # elem_classes=["required"],
83
- # )
84
- # slider1 = gr.Slider(
85
- # minimum=0,
86
- # maximum=1000,
87
- # step=50,
88
- # value=0,
89
- # label="Milliseconds BEFORE each clip",
90
- # )
91
- # slider2 = gr.Slider(
92
- # minimum=0,
93
- # maximum=1000,
94
- # step=50,
95
- # value=500,
96
- # label="Milliseconds AFTER each clip",
97
- # )
98
- # btn_submit = gr.Button(value="Run", variant="primary", size="sm")
99
- # video = gr.Video(
100
- # format="mp4", label="Video file", show_label=True, interactive=False
101
- # )
102
- # with gr.Column(scale=5):
103
- # file = gr.File(
104
- # label="Clips", show_label=True, file_count=1, interactive=False
105
- # )
106
- # lines = gr.Textbox(
107
- # lines=10, label="Clips", interactive=False, show_copy_button=True
108
- # )
109
- # data = gr.Dataframe(
110
- # label="Clips",
111
- # headers=["text", "start", "stop", "file"],
112
- # datatype=["str", "str", "str", "str"],
113
- # row_count=0,
114
- # )
115
- # btn_submit.click(
116
- # calculate_times,
117
- # inputs=[text1, text2, slider1, slider2],
118
- # outputs=[lines, file, data],
119
- # )
120
- # text1.blur(load_video, inputs=[text1], outputs=[video])
121
-
122
- # app.launch()
123
-
124
- import gradio as gr
125
-
126
-
127
- def diff_texts(text1, text2):
128
- return text1 + ' ' +text2
129
-
130
-
131
- app = gr.Interface(
132
- diff_texts,
133
- [
134
- gr.Textbox(
135
- label="Text 1",
136
- info="Initial text",
137
- lines=3,
138
- value="The quick brown fox jumped over the lazy dogs.",
139
- ),
140
- gr.Textbox(
141
- label="Text 2",
142
- info="Text to compare",
143
- lines=3,
144
- value="The fast brown fox jumps over lazy dogs.",
145
- ),
146
- ],
147
- gr.Textbox(
148
- label="Sum",
149
- ),
150
- theme=gr.themes.Base()
151
- )
152
 
153
  app.launch()
 
1
+ import csv
2
+ import gradio as gr # type: ignore
3
+ import pandas as pd # type: ignore
4
+
5
# Gradio UI: two code editors for pasting the raw inputs, a clear/rebuild
# button row, the rebuilt-conversation table, and a download area for the
# exported TSV/Excel artifacts. Event wiring is attached elsewhere via the
# @btn_build.click decorator.
with gr.Blocks(title="Conversation rebuilder") as app:
    gr.Markdown(
        """# Conversation rebuilder
Please, fill the Database Transcript and the List of matched clips from Kibana, and click the Rebuild button"""
    )
    with gr.Row():
        # Code components keep whitespace/tabs intact, which matters for the
        # tab-separated Kibana export.
        txt_transcript = gr.Code(label="Database Transcript", interactive=True, lines=5)
        txt_clips = gr.Code(label="Kibana clips", interactive=True, lines=5)
    with gr.Row():
        gr.ClearButton(
            value="Clear",
            variant="secondary",
            size="sm",
            components=[txt_transcript, txt_clips],
        )
        btn_build = gr.Button(value="Rebuild", variant="primary", size="sm")
    with gr.Row():
        # Read-only result table; column count is fixed to match the headers.
        data = gr.Dataframe(
            label="CONVERSATION",
            headers=["index", "user", "agent", "gpt", "distance"],
            datatype=["str", "str", "str", "str", "number"],
            column_widths=["8%", "29%", "29%", "29%", "5%"],
            col_count=(5, "fixed"),
            interactive=False,
            wrap=True,
        )
    with gr.Row():
        file = gr.File(
            label="Export files",
            show_label=True,
            height=60,
            container=True,
            interactive=False,
            file_count="single",
        )
45
# Column layout of one tab-separated clip line in the Kibana export.
COL_TIMESTAMP = 0
COL_CONVERSATION_ID = 1
COL_CLIP_COLLECTION_ID = 2
# Fixed: was 2, which duplicated COL_CLIP_COLLECTION_ID and left column 3
# unaddressed (COL_SENTENCE_INDEX starts at 4).
COL_REQUEST_ID = 3
COL_SENTENCE_INDEX = 4
COL_SENTENCE_ORIGINAL = 5
COL_CLIP_TEXT = 6
COL_CLIP_ID = 7
COL_DISTANCE = 8


def find_clips_matched(agent_text, clips):
    """Look up the Kibana clip whose clip text equals *agent_text*.

    Parameters
    ----------
    agent_text : str
        Exact agent sentence to search for (compared with ``==``).
    clips : str
        Raw Kibana export, one tab-separated clip per line.

    Returns
    -------
    tuple
        ``(original GPT sentence, sentence index, distance rounded to 2
        decimals)`` for the first matching clip, or ``("", None, None)``
        when nothing matches. The original version implicitly returned
        ``None`` in that case, which crashed the caller's tuple unpacking.
    """
    for clip in clips.splitlines():
        parts = clip.strip().split("\t")
        # Guard against short/malformed lines instead of raising IndexError.
        if len(parts) > COL_DISTANCE and parts[COL_CLIP_TEXT] == agent_text:
            return (
                parts[COL_SENTENCE_ORIGINAL],
                int(parts[COL_SENTENCE_INDEX]),
                round(float(parts[COL_DISTANCE]), 2),
            )
    return "", None, None
65
@btn_build.click(inputs=[txt_transcript, txt_clips], outputs=[data, file])
def rebuild_conversation(transcript, clips):
    """Rebuild the conversation table from a raw transcript and Kibana clips.

    Pairs each ``agent:`` line of the transcript with the preceding
    ``user:`` line, enriches it with the matching Kibana clip (GPT
    sentence, order, distance), and exports the result as a TSV and an
    Excel file named after the conversation id.

    Parameters
    ----------
    transcript : str
        Database transcript with ``user:`` / ``agent:`` prefixed lines.
    clips : str
        Raw Kibana export, one tab-separated clip per line.

    Returns
    -------
    tuple(pandas.DataFrame, list | None)
        The conversation table and the exported file paths
        (``[xlsx, tsv]``), or ``(df-with-error-row, None)`` when either
        input is blank.
    """
    df = pd.DataFrame({"index": [], "user": [], "agent": [], "gpt": [], "distance": []})
    if not transcript.strip() or not clips.strip():
        # Surface the problem in the table itself so the UI shows it.
        df.loc[len(df.index)] = ["", "EMPTY TRANSCRIPT OR LIST OF CLIPS!", "", "", ""]
        return df, None
    user_text = ""
    conversation_line = 1
    for raw_line in transcript.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("user:"):
            user_text = line.replace("user:", "").strip()
            conversation_line += 1
        elif line.startswith("agent:"):
            agent_text = line.replace("agent:", "").strip()
            # Tolerate a missing match: older find_clips_matched returns
            # None when no clip text equals agent_text.
            gpt, order, distance = find_clips_matched(agent_text, clips) or ("", None, None)
            index = f"{str(conversation_line).zfill(3)}-{str(order).zfill(2)}"
            df.loc[len(df.index)] = [index, user_text, agent_text, gpt, distance]
            user_text = ""
    # The conversation id comes from the first clip line; it names the exports.
    parts = clips.splitlines()[0].strip().split("\t")
    conversation_id = parts[COL_CONVERSATION_ID] if len(parts) > COL_CONVERSATION_ID else "unknown"
    tsv_file_name = f"conversation-{conversation_id}.tsv"
    excel_file_name = f"conversation-{conversation_id}.xlsx"
    # Build TSV file.
    df.to_csv(
        tsv_file_name,
        sep="\t",
        encoding="utf-8",
        index=False,
        header=True,
        quoting=csv.QUOTE_ALL,
    )
    df_excel = df.copy(deep=True)
    df_excel.set_index("index", inplace=True)
    # Build Excel file. Using ExcelWriter as a context manager guarantees the
    # handle is closed even if formatting raises (the original leaked it when
    # an exception fired before writer.close()).
    # https://xlsxwriter.readthedocs.io/working_with_pandas.html
    with pd.ExcelWriter(excel_file_name, engine="xlsxwriter") as writer:
        df_excel.to_excel(writer, sheet_name="Conversation")
        workbook = writer.book
        worksheet = writer.sheets["Conversation"]
        # https://xlsxwriter.readthedocs.io/format.html#number-formats-in-different-locales
        number_format = workbook.add_format({"num_format": "#,##0.00"})
        text_format = workbook.add_format({"text_wrap": True})
        # Wide wrapped text columns for user/agent/gpt, narrow numeric distance.
        worksheet.set_column("B:D", 50, text_format)
        worksheet.set_column("E:E", 8, number_format)
        worksheet.autofit()
    return df, [excel_file_name, tsv_file_name]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
# Start the Gradio server for the UI defined above (blocks until stopped).
app.launch()
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- gradio
 
 
1
+ gradio
2
+ xlsxwriter