sergiomar73 commited on
Commit
d3b2949
·
1 Parent(s): eba7dd5

Basic implementation ok

Browse files
Files changed (3) hide show
  1. README.md +4 -6
  2. app.py +125 -151
  3. requirements.txt +2 -1
README.md CHANGED
@@ -1,13 +1,11 @@
1
  ---
2
- title: Rebuild Conversation
3
- emoji: 👁
4
  colorFrom: yellow
5
- colorTo: green
6
  sdk: gradio
7
  sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
  python_version: 3.9.13
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Conversation rebuilder
3
+ emoji: 🗣️
4
  colorFrom: yellow
5
+ colorTo: orange
6
  sdk: gradio
7
  sdk_version: 4.21.0
8
  app_file: app.py
9
  pinned: false
10
  python_version: 3.9.13
11
+ ---
 
 
app.py CHANGED
@@ -1,153 +1,127 @@
1
- # import csv
2
- # import datetime
3
- # import gradio as gr
4
- # import pandas as pd
5
- # from io import BytesIO
6
- # from pathlib import Path
7
- # from urllib.parse import urlparse
8
-
9
-
10
- # def format_seconds(secs):
11
- # t = datetime.datetime(
12
- # year=1, month=1, day=1, hour=0, minute=0
13
- # ) + datetime.timedelta(seconds=secs)
14
- # return t.strftime("%M:%S.%f")[:-3]
15
-
16
-
17
- # def get_filename_and_extension(url):
18
- # parsed_url = urlparse(url)
19
- # path = parsed_url.path
20
- # filename = Path(path).name
21
- # filename_without_extension = Path(filename).stem
22
- # file_extension = Path(filename).suffix
23
- # return filename, filename_without_extension, file_extension
24
-
25
-
26
- # def calculate_times(input_url, input_text, ms_before, ms_after):
27
- # _, _, file_extension = get_filename_and_extension(input_url)
28
- # file_extension = file_extension.replace(".", "")
29
- # df = pd.DataFrame({"text": [], "start": [], "stop": [], "file": []})
30
- # lines = input_text.splitlines()
31
- # segments = []
32
- # if len(lines) != len(segments):
33
- # msg = f"DETECTED CLIPS AND INPUT LINES DO NOT MATCH!\n\nYou are expecting {len(lines)} clips BUT {len(segments)} segments have been found in the video file."
34
- # df.loc[len(df.index)] = ["", "", "", ""]
35
- # return msg, None, df
36
- # else:
37
- # res = []
38
- # for i in range(len(segments)):
39
- # line = lines[i].rstrip()
40
- # res.append(f"{line}\t{segments[i][0]}\t{segments[i][1]}\t{input_url}")
41
- # df.loc[len(df.index)] = [line, segments[i][0], segments[i][1], input_url]
42
- # df.to_csv(
43
- # "clips.tsv",
44
- # sep="\t",
45
- # encoding="utf-8",
46
- # index=False,
47
- # header=False,
48
- # quoting=csv.QUOTE_NONE,
49
- # )
50
- # return "\n".join(res), "clips.tsv", df
51
-
52
-
53
- # def load_video(input_url):
54
- # if input_url:
55
- # return input_url
56
- # return None
57
-
58
-
59
- # css = """
60
- # .required {background-color: #FFCCCB !important, font-size: 24px !important}
61
- # """
62
-
63
-
64
- # with gr.Blocks(title="Start and stop times", css=css) as app:
65
- # gr.Markdown(
66
- # """# Start and stop times generator
67
- # Please, fill the Video URL and Clip texts textboxes and click the Run button"""
68
- # )
69
- # with gr.Row():
70
- # with gr.Column(scale=3):
71
- # text1 = gr.Textbox(
72
- # lines=1,
73
- # placeholder="Video URL...",
74
- # label="Video URL",
75
- # elem_classes=["required"],
76
- # )
77
- # text2 = gr.Textbox(
78
- # lines=5,
79
- # max_lines=10,
80
- # placeholder="List of clip texts...",
81
- # label="Clip texts",
82
- # elem_classes=["required"],
83
- # )
84
- # slider1 = gr.Slider(
85
- # minimum=0,
86
- # maximum=1000,
87
- # step=50,
88
- # value=0,
89
- # label="Milliseconds BEFORE each clip",
90
- # )
91
- # slider2 = gr.Slider(
92
- # minimum=0,
93
- # maximum=1000,
94
- # step=50,
95
- # value=500,
96
- # label="Milliseconds AFTER each clip",
97
- # )
98
- # btn_submit = gr.Button(value="Run", variant="primary", size="sm")
99
- # video = gr.Video(
100
- # format="mp4", label="Video file", show_label=True, interactive=False
101
- # )
102
- # with gr.Column(scale=5):
103
- # file = gr.File(
104
- # label="Clips", show_label=True, file_count=1, interactive=False
105
- # )
106
- # lines = gr.Textbox(
107
- # lines=10, label="Clips", interactive=False, show_copy_button=True
108
- # )
109
- # data = gr.Dataframe(
110
- # label="Clips",
111
- # headers=["text", "start", "stop", "file"],
112
- # datatype=["str", "str", "str", "str"],
113
- # row_count=0,
114
- # )
115
- # btn_submit.click(
116
- # calculate_times,
117
- # inputs=[text1, text2, slider1, slider2],
118
- # outputs=[lines, file, data],
119
- # )
120
- # text1.blur(load_video, inputs=[text1], outputs=[video])
121
-
122
- # app.launch()
123
-
124
- import gradio as gr
125
-
126
-
127
- def diff_texts(text1, text2):
128
- return text1 + ' ' +text2
129
-
130
-
131
- app = gr.Interface(
132
- diff_texts,
133
- [
134
- gr.Textbox(
135
- label="Text 1",
136
- info="Initial text",
137
- lines=3,
138
- value="The quick brown fox jumped over the lazy dogs.",
139
- ),
140
- gr.Textbox(
141
- label="Text 2",
142
- info="Text to compare",
143
- lines=3,
144
- value="The fast brown fox jumps over lazy dogs.",
145
- ),
146
- ],
147
- gr.Textbox(
148
- label="Sum",
149
- ),
150
- theme=gr.themes.Base()
151
- )
152
 
153
  app.launch()
 
1
+ import csv
2
+ import gradio as gr # type: ignore
3
+ import pandas as pd # type: ignore
4
+
5
# Gradio UI: two code editors for pasting the raw inputs, a clear/rebuild
# button row, the rebuilt-conversation table, and a download area for the
# exported TSV/Excel artifacts. Event wiring is attached elsewhere via the
# @btn_build.click decorator.
with gr.Blocks(title="Conversation rebuilder") as app:
    gr.Markdown(
        """# Conversation rebuilder
Please, fill the Database Transcript and the List of matched clips from Kibana, and click the Rebuild button"""
    )
    with gr.Row():
        # Code components keep whitespace/tabs intact, which matters for the
        # tab-separated Kibana export.
        txt_transcript = gr.Code(label="Database Transcript", interactive=True, lines=5)
        txt_clips = gr.Code(label="Kibana clips", interactive=True, lines=5)
    with gr.Row():
        gr.ClearButton(
            value="Clear",
            variant="secondary",
            size="sm",
            components=[txt_transcript, txt_clips],
        )
        btn_build = gr.Button(value="Rebuild", variant="primary", size="sm")
    with gr.Row():
        # Read-only result table; column count is fixed to match the headers.
        data = gr.Dataframe(
            label="CONVERSATION",
            headers=["index", "user", "agent", "gpt", "distance"],
            datatype=["str", "str", "str", "str", "number"],
            column_widths=["8%", "29%", "29%", "29%", "5%"],
            col_count=(5, "fixed"),
            interactive=False,
            wrap=True,
        )
    with gr.Row():
        file = gr.File(
            label="Export files",
            show_label=True,
            height=60,
            container=True,
            interactive=False,
            file_count="single",
        )
45
# Column layout of one tab-separated clip line in the Kibana export.
COL_TIMESTAMP = 0
COL_CONVERSATION_ID = 1
COL_CLIP_COLLECTION_ID = 2
# Fixed: was 2, which duplicated COL_CLIP_COLLECTION_ID and left column 3
# unaddressed (COL_SENTENCE_INDEX starts at 4).
COL_REQUEST_ID = 3
COL_SENTENCE_INDEX = 4
COL_SENTENCE_ORIGINAL = 5
COL_CLIP_TEXT = 6
COL_CLIP_ID = 7
COL_DISTANCE = 8


def find_clips_matched(agent_text, clips):
    """Look up the Kibana clip whose clip text equals *agent_text*.

    Parameters
    ----------
    agent_text : str
        Exact agent sentence to search for (compared with ``==``).
    clips : str
        Raw Kibana export, one tab-separated clip per line.

    Returns
    -------
    tuple
        ``(original GPT sentence, sentence index, distance rounded to 2
        decimals)`` for the first matching clip, or ``("", None, None)``
        when nothing matches. The original version implicitly returned
        ``None`` in that case, which crashed the caller's tuple unpacking.
    """
    for clip in clips.splitlines():
        parts = clip.strip().split("\t")
        # Guard against short/malformed lines instead of raising IndexError.
        if len(parts) > COL_DISTANCE and parts[COL_CLIP_TEXT] == agent_text:
            return (
                parts[COL_SENTENCE_ORIGINAL],
                int(parts[COL_SENTENCE_INDEX]),
                round(float(parts[COL_DISTANCE]), 2),
            )
    return "", None, None
65
@btn_build.click(inputs=[txt_transcript, txt_clips], outputs=[data, file])
def rebuild_conversation(transcript, clips):
    """Rebuild the conversation table from a raw transcript and Kibana clips.

    Pairs each ``agent:`` line of the transcript with the preceding
    ``user:`` line, enriches it with the matching Kibana clip (GPT
    sentence, order, distance), and exports the result as a TSV and an
    Excel file named after the conversation id.

    Parameters
    ----------
    transcript : str
        Database transcript with ``user:`` / ``agent:`` prefixed lines.
    clips : str
        Raw Kibana export, one tab-separated clip per line.

    Returns
    -------
    tuple(pandas.DataFrame, list | None)
        The conversation table and the exported file paths
        (``[xlsx, tsv]``), or ``(df-with-error-row, None)`` when either
        input is blank.
    """
    df = pd.DataFrame({"index": [], "user": [], "agent": [], "gpt": [], "distance": []})
    if not transcript.strip() or not clips.strip():
        # Surface the problem in the table itself so the UI shows it.
        df.loc[len(df.index)] = ["", "EMPTY TRANSCRIPT OR LIST OF CLIPS!", "", "", ""]
        return df, None
    user_text = ""
    conversation_line = 1
    for raw_line in transcript.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith("user:"):
            user_text = line.replace("user:", "").strip()
            conversation_line += 1
        elif line.startswith("agent:"):
            agent_text = line.replace("agent:", "").strip()
            # Tolerate a missing match: older find_clips_matched returns
            # None when no clip text equals agent_text.
            gpt, order, distance = find_clips_matched(agent_text, clips) or ("", None, None)
            index = f"{str(conversation_line).zfill(3)}-{str(order).zfill(2)}"
            df.loc[len(df.index)] = [index, user_text, agent_text, gpt, distance]
            user_text = ""
    # The conversation id comes from the first clip line; it names the exports.
    parts = clips.splitlines()[0].strip().split("\t")
    conversation_id = parts[COL_CONVERSATION_ID] if len(parts) > COL_CONVERSATION_ID else "unknown"
    tsv_file_name = f"conversation-{conversation_id}.tsv"
    excel_file_name = f"conversation-{conversation_id}.xlsx"
    # Build TSV file.
    df.to_csv(
        tsv_file_name,
        sep="\t",
        encoding="utf-8",
        index=False,
        header=True,
        quoting=csv.QUOTE_ALL,
    )
    df_excel = df.copy(deep=True)
    df_excel.set_index("index", inplace=True)
    # Build Excel file. Using ExcelWriter as a context manager guarantees the
    # handle is closed even if formatting raises (the original leaked it when
    # an exception fired before writer.close()).
    # https://xlsxwriter.readthedocs.io/working_with_pandas.html
    with pd.ExcelWriter(excel_file_name, engine="xlsxwriter") as writer:
        df_excel.to_excel(writer, sheet_name="Conversation")
        workbook = writer.book
        worksheet = writer.sheets["Conversation"]
        # https://xlsxwriter.readthedocs.io/format.html#number-formats-in-different-locales
        number_format = workbook.add_format({"num_format": "#,##0.00"})
        text_format = workbook.add_format({"text_wrap": True})
        # Wide wrapped text columns for user/agent/gpt, narrow numeric distance.
        worksheet.set_column("B:D", 50, text_format)
        worksheet.set_column("E:E", 8, number_format)
        worksheet.autofit()
    return df, [excel_file_name, tsv_file_name]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
# Start the Gradio server for the UI defined above (blocks until stopped).
app.launch()
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- gradio
 
 
1
+ gradio
2
+ xlsxwriter