navalnica committed
Commit: 1e32511
Parent(s): 659f5b7

improve split prompt

Files changed:
- data/samples_to_split.py +45 -0
- pg.ipynb +82 -43
- readme.md +12 -0
- prompts.py → src/prompts.py +52 -2
- src/text_split_chain.py +102 -0
- src/utils.py +17 -0
- utils.py +0 -49
data/samples_to_split.py
CHANGED
@@ -25,6 +25,37 @@ bed, and lay down stiffly—was instantly asleep.
 """
 
 GATSBY_2 = """\
+Inside, the crimson room bloomed with light. Tom and Miss Baker sat at
+either end of the long couch and she read aloud to him from the
+Saturday Evening Post—the words, murmurous and uninflected, running
+together in a soothing tune. The lamplight, bright on his boots and
+dull on the autumn-leaf yellow of her hair, glinted along the paper as
+she turned a page with a flutter of slender muscles in her arms.
+
+When we came in she held us silent for a moment with a lifted hand.
+
+“To be continued,” she said, tossing the magazine on the table, “in
+our very next issue.”
+
+Her body asserted itself with a restless movement of her knee, and she
+stood up.
+
+“Ten o’clock,” she remarked, apparently finding the time on the
+ceiling. “Time for this good girl to go to bed.”
+
+“Jordan’s going to play in the tournament tomorrow,” explained Daisy,
+“over at Westchester.”
+
+“Oh—you’re Jordan Baker.”
+
+I knew now why her face was familiar—its pleasing contemptuous
+expression had looked out at me from many rotogravure pictures of the
+sporting life at Asheville and Hot Springs and Palm Beach. I had heard
+some story of her too, a critical, unpleasant story, but what it was I
+had forgotten long ago.
+
+“Good night,” she said softly. “Wake me at eight, won’t you.”
+
 “If you’ll get up.”
 
 “I will. Good night, Mr. Carraway. See you anon.”
@@ -50,4 +81,18 @@ of weekends out here this summer. I think the home influence will be
 very good for her.”
 
 Daisy and Tom looked at each other for a moment in silence.
+
+“Is she from New York?” I asked quickly.
+
+“From Louisville. Our white girlhood was passed together there. Our
+beautiful white—”
+
+“Did you give Nick a little heart to heart talk on the veranda?”
+demanded Tom suddenly.
+
+“Did I?” She looked at me. “I can’t seem to remember, but I think we
+talked about the Nordic race. Yes, I’m sure we did. It sort of crept
+up on us and first thing you know—”
+
+“Don’t believe everything you hear, Nick,” he advised me.
 """
pg.ipynb
CHANGED
@@ -50,86 +50,125 @@
    "outputs": [],
    "source": [
     "import data.samples_to_split as samples\n",
-    "from utils import GPTModels,
+    "from src.utils import GPTModels\n",
+    "from src.text_split_chain import create_split_text_chain_v2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 4,
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "characters: ['narrator', 'Mr. Carraway', 'Daisy', 'Miss Baker', 'Tom', 'Nick']\n",
-      "[narrator] “If you’ll get up.”\n",
-      "[Mr. Carraway] “I will. Good night, Mr. Carraway. See you anon.”\n",
-      "[Daisy] “Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n",
-      "[Miss Baker] “Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”\n",
-      "[Tom] “She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”\n",
-      "[Daisy] “Who oughtn’t to?” inquired Daisy coldly.\n",
-      "[narrator] “Her family.”\n",
-      "[narrator] “Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n",
-      "[narrator] Daisy and Tom looked at each other for a moment in silence.\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "chain =
+    "chain = create_split_text_chain_v2(llm_model=GPTModels.GPT_4o)\n",
+    "# chain = create_split_text_chain_v2(llm_model=GPTModels.GPT_4_TURBO_2024_04_09)\n",
     "with get_openai_callback() as cb:\n",
-    " res = chain.invoke({'text': samples.GATSBY_2})
-    "print(res.to_pretty_text())"
+    " res = chain.invoke({'text': samples.GATSBY_2})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 5,
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "
-       "\tPrompt Tokens: 1253\n",
-       "\tCompletion Tokens: 326\n",
-       "Successful Requests: 1\n",
-       "Total Cost (USD): $0.02231"
+       "SplitTextOutputV2(text_raw='<narrator>Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.</narrator>\\n\\n<narrator>When we came in she held us silent for a moment with a lifted hand.</narrator>\\n\\n<Jordan>“To be continued,”</Jordan> <narrator>she said, tossing the magazine on the table,</narrator> <Jordan>“in our very next issue.”</Jordan>\\n\\n<narrator>Her body asserted itself with a restless movement of her knee, and she stood up.</narrator>\\n\\n<Jordan>“Ten o’clock,”</Jordan> <narrator>she remarked, apparently finding the time on the ceiling.</narrator> <Jordan>“Time for this good girl to go to bed.”</Jordan>\\n\\n<Daisy>“Jordan’s going to play in the tournament tomorrow,”</Daisy> <narrator>explained Daisy,</narrator> <Daisy>“over at Westchester.”</Daisy>\\n\\n<narrator>“Oh—you’re Jordan Baker.”</narrator>\\n\\n<narrator>I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.</narrator>\\n\\n<Jordan>“Good night,”</Jordan> <narrator>she said softly.</narrator> <Jordan>“Wake me at eight, won’t you.”</Jordan>\\n\\n<c1>“If you’ll get up.”</c1>\\n\\n<Jordan>“I will. Good night, Mr. Carraway. See you anon.”</Jordan>\\n\\n<Daisy>“Of course you will,”</Daisy> <narrator>confirmed Daisy.</narrator> <Daisy>“In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”</Daisy>\\n\\n<Jordan>“Good night,”</Jordan> <narrator>called Miss Baker from the stairs.</narrator> <Jordan>“I haven’t heard a word.”</Jordan>\\n\\n<Tom>“She’s a nice girl,”</Tom> <narrator>said Tom after a moment.</narrator> <Tom>“They oughtn’t to let her run around the country this way.”</Tom>\\n\\n<Daisy>“Who oughtn’t to?”</Daisy> <narrator>inquired Daisy coldly.</narrator>\\n\\n<Tom>“Her family.”</Tom>\\n\\n<Daisy>“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”</Daisy>\\n\\n<narrator>Daisy and Tom looked at each other for a moment in silence.</narrator>\\n\\n<narrator>“Is she from New York?”</narrator> <narrator>I asked quickly.</narrator>\\n\\n<Daisy>“From Louisville. Our white girlhood was passed together there. Our beautiful white—”</Daisy>\\n\\n<Tom>“Did you give Nick a little heart to heart talk on the veranda?”</Tom> <narrator>demanded Tom suddenly.</narrator>\\n\\n<Daisy>“Did I?”</Daisy> <narrator>She looked at me.</narrator> <Daisy>“I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”</Daisy>\\n\\n<Tom>“Don’t believe everything you hear, Nick,”</Tom> <narrator>he advised me.</narrator>')"
       ]
      },
-     "execution_count":
+     "execution_count": 5,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "
+    "res"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "characters: ['c1', 'Daisy', 'Jordan', 'Tom', 'narrator']\n",
+      "--------------------\n",
+      "[narrator] Inside, the crimson room bloomed with light. Tom and Miss Baker sat at either end of the long couch and she read aloud to him from the Saturday Evening Post—the words, murmurous and uninflected, running together in a soothing tune. The lamplight, bright on his boots and dull on the autumn-leaf yellow of her hair, glinted along the paper as she turned a page with a flutter of slender muscles in her arms.\n",
+      "[narrator] When we came in she held us silent for a moment with a lifted hand.\n",
+      "[Jordan] “To be continued,”\n",
+      "[narrator] she said, tossing the magazine on the table,\n",
+      "[Jordan] “in our very next issue.”\n",
+      "[narrator] Her body asserted itself with a restless movement of her knee, and she stood up.\n",
+      "[Jordan] “Ten o’clock,”\n",
+      "[narrator] she remarked, apparently finding the time on the ceiling.\n",
+      "[Jordan] “Time for this good girl to go to bed.”\n",
+      "[Daisy] “Jordan’s going to play in the tournament tomorrow,”\n",
+      "[narrator] explained Daisy,\n",
+      "[Daisy] “over at Westchester.”\n",
+      "[narrator] “Oh—you’re Jordan Baker.”\n",
+      "[narrator] I knew now why her face was familiar—its pleasing contemptuous expression had looked out at me from many rotogravure pictures of the sporting life at Asheville and Hot Springs and Palm Beach. I had heard some story of her too, a critical, unpleasant story, but what it was I had forgotten long ago.\n",
+      "[Jordan] “Good night,”\n",
+      "[narrator] she said softly.\n",
+      "[Jordan] “Wake me at eight, won’t you.”\n",
+      "[c1] “If you’ll get up.”\n",
+      "[Jordan] “I will. Good night, Mr. Carraway. See you anon.”\n",
+      "[Daisy] “Of course you will,”\n",
+      "[narrator] confirmed Daisy.\n",
+      "[Daisy] “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”\n",
+      "[Jordan] “Good night,”\n",
+      "[narrator] called Miss Baker from the stairs.\n",
+      "[Jordan] “I haven’t heard a word.”\n",
+      "[Tom] “She’s a nice girl,”\n",
+      "[narrator] said Tom after a moment.\n",
+      "[Tom] “They oughtn’t to let her run around the country this way.”\n",
+      "[Daisy] “Who oughtn’t to?”\n",
+      "[narrator] inquired Daisy coldly.\n",
+      "[Tom] “Her family.”\n",
+      "[Daisy] “Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n",
+      "[narrator] Daisy and Tom looked at each other for a moment in silence.\n",
+      "[narrator] “Is she from New York?”\n",
+      "[narrator] I asked quickly.\n",
+      "[Daisy] “From Louisville. Our white girlhood was passed together there. Our beautiful white—”\n",
+      "[Tom] “Did you give Nick a little heart to heart talk on the veranda?”\n",
+      "[narrator] demanded Tom suddenly.\n",
+      "[Daisy] “Did I?”\n",
+      "[narrator] She looked at me.\n",
+      "[Daisy] “I can’t seem to remember, but I think we talked about the Nordic race. Yes, I’m sure we did. It sort of crept up on us and first thing you know—”\n",
+      "[Tom] “Don’t believe everything you hear, Nick,”\n",
+      "[narrator] he advised me.\n"
+     ]
+    }
+   ],
+   "source": [
+    "annotated_text = res.to_character_annotated_text()\n",
+    "print(annotated_text.to_pretty_text())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count":
+   "execution_count": 9,
    "metadata": {},
    "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "
-      "
-      "
-      "
-      "
-      "
-      "
-      "[narrator] “Her family.”\n",
-      "[narrator] “Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”\n",
-      "[narrator] Daisy and Tom looked at each other for a moment in silence.\n"
+      "LLM usage:\n",
+      "\n",
+      "Tokens Used: 1817\n",
+      "\tPrompt Tokens: 877\n",
+      "\tCompletion Tokens: 940\n",
+      "Successful Requests: 1\n",
+      "Total Cost (USD): $0.0115925\n"
      ]
     }
    ],
    "source": [
-    "print(
+    "print(f'LLM usage:\\n\\n{cb}')"
    ]
   },
   {
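For reference, the updated notebook cells above boil down to the following standalone sketch. It assumes an OPENAI_API_KEY is set in the environment and that get_openai_callback comes from langchain_community.callbacks; the actual import lives in a notebook cell outside this hunk, so that import path is an assumption.

# Sketch of the updated pg.ipynb flow (assumptions noted above).
from langchain_community.callbacks import get_openai_callback

import data.samples_to_split as samples
from src.text_split_chain import create_split_text_chain_v2
from src.utils import GPTModels

chain = create_split_text_chain_v2(llm_model=GPTModels.GPT_4o)

with get_openai_callback() as cb:
    # returns a SplitTextOutputV2 wrapping the XML-tagged rewrite of the sample
    res = chain.invoke({"text": samples.GATSBY_2})

# parse the tags into (character, phrase) pairs and print them
annotated_text = res.to_character_annotated_text()
print(annotated_text.to_pretty_text())

print(f"LLM usage:\n\n{cb}")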
readme.md
ADDED
@@ -0,0 +1,12 @@
+### TODO
+
+- [ ] prepare text for TTS
+- [x] prepare prompt to split text into character phrases
+- [ ] split large text in batches, process each batch separately, concat batches
+- [ ] try to identify unknown characters
+- [ ] select voices for TTS
+- [ ] map characters to available voices
+- [ ] use LLM to recognize characters for a given text and provide descriptions
+detailed enough to select appropriate voice
+- [ ] run TTS to create narration
+- [ ] add effects. mix them with created narration
prompts.py → src/prompts.py
RENAMED
@@ -1,4 +1,4 @@
-class SplitTextPrompt:
+class SplitTextPromptV1:
     SYSTEM = """\
 You are a helpful assistant proficient in literature and language.
 Imagine you are helping to prepare the provided text for narration to create the audio book.
@@ -37,7 +37,9 @@ Format your answer as a following JSON:
 
 Ensure the order of the parts in the JSON output matches the original order of the text.
 
-Example of text split by characters, already in the target format.
+Examples of text split by characters, already in the target format.
+
+Example 1.
 {{
 "characters": ["Mr. Gatz", "narrator"],
 "parts":
@@ -53,6 +55,54 @@ Example of text split by characters, already in the target format.
 {{"character": "Mr. Gatz", "text": "He fumbled at the embroidered coverlet, trying to take it from the bed, and lay down stiffly—was instantly asleep."}},
 ]
 }}
+
+Example 2.
+{{
+    'characters': [
+        'narrator',
+        'Mr. Carraway',
+        'Daisy',
+        'Miss Baker',
+        'Tom',
+        'Nick'
+    ],
+    'parts': [
+        {{'character': 'narrator', 'text': '“If you’ll get up.”'}},
+        {{'character': 'Mr. Carraway', 'text': '“I will. Good night, Mr. Carraway. See you anon.”'}},
+        {{'character': 'Daisy', 'text': '“Of course you will,” confirmed Daisy. “In fact I think I’ll arrange a marriage. Come over often, Nick, and I’ll sort of—oh—fling you together. You know—lock you up accidentally in linen closets and push you out to sea in a boat, and all that sort of thing—”'}},
+        {{'character': 'Miss Baker', 'text': '“Good night,” called Miss Baker from the stairs. “I haven’t heard a word.”'}},
+        {{'character': 'Tom', 'text': '“She’s a nice girl,” said Tom after a moment. “They oughtn’t to let her run around the country this way.”'}},
+        {{'character': 'Daisy', 'text': '“Who oughtn’t to?” inquired Daisy coldly.'}},
+        {{'character': 'narrator', 'text': '“Her family.”'}},
+        {{'character': 'narrator', 'text': '“Her family is one aunt about a thousand years old. Besides, Nick’s going to look after her, aren’t you, Nick? She’s going to spend lots of weekends out here this summer. I think the home influence will be very good for her.”'}},
+        {{'character': 'narrator', 'text': 'Daisy and Tom looked at each other for a moment in silence.'}}
+    ]
+}}
+"""
+
+    USER = """\
+Here is the book sample:
+---
+{text}"""
+
+
+class SplitTextPromptV2:
+    SYSTEM = """\
+you are provided with the book sample.
+please rewrite it and insert xml tags indicating character to whom current phrase belongs.
+for example: <narrator>I looked at her</narrator><Jill>What are you looking at?</Jill>
+
+Notes:
+- sometimes narrator is one of characters taking part in the action.
+in this case use narrator's name (if available) instead of "narrator"
+- if it's impossible to identify character name from the text provided, use codes "c1", "c2", etc,
+where "c" prefix means character and number is used to enumerate unknown characters
+- all quotes of direct speech must be attributed to characters, for example:
+<Tom>“She’s a nice girl,”</Tom><narrator>said Tom after a moment.</narrator>
+mind that sometimes narrator could also be a character.
+- use ALL available context to determine the character.
+sometimes the character name becomes clear from the following phrases
+- DO NOT include in your response anything except for the original text with character xml tags!!!
 """
 
     USER = """\
src/text_split_chain.py
ADDED
@@ -0,0 +1,102 @@
+import re
+
+from langchain_core.output_parsers import StrOutputParser
+from langchain_core.prompts import (
+    ChatPromptTemplate,
+    HumanMessagePromptTemplate,
+    SystemMessagePromptTemplate,
+)
+from pydantic import BaseModel
+
+from src.prompts import SplitTextPromptV1, SplitTextPromptV2
+from src.utils import GPTModels, get_chat_llm
+
+
+class CharacterPhrase(BaseModel):
+    character: str
+    text: str
+
+
+class CharacterAnnotatedText(BaseModel):
+    phrases: list[CharacterPhrase]
+    _characters: list[str]
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self._characters = list(set(phrase.character for phrase in self.phrases))
+
+    @property
+    def characters(self):
+        return self._characters
+
+    def to_pretty_text(self):
+        lines = []
+        lines.append(f"characters: {self.characters}")
+        lines.append("-" * 20)
+        lines.extend(f"[{phrase.character}] {phrase.text}" for phrase in self.phrases)
+        res = "\n".join(lines)
+        return res
+
+
+class SplitTextOutputV1(BaseModel):
+    characters: list[str]
+    parts: list[CharacterPhrase]
+
+    def to_character_annotated_text(self):
+        return CharacterAnnotatedText(phrases=self.parts)
+
+
+def create_split_text_chain_v1(llm_model: GPTModels):
+    llm = get_chat_llm(llm_model=llm_model, temperature=0.0)
+    llm = llm.with_structured_output(SplitTextOutputV1)
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            SystemMessagePromptTemplate.from_template(SplitTextPromptV1.SYSTEM),
+            HumanMessagePromptTemplate.from_template(SplitTextPromptV1.USER),
+        ]
+    )
+
+    chain = prompt | llm
+    return chain
+
+
+class SplitTextOutputV2(BaseModel):
+    text_raw: str
+    _phrases: list[CharacterPhrase]
+
+    @staticmethod
+    def _parse_phrases_from_xml_tags(text):
+        """
+        we rely on LLM to format response correctly.
+        so we don't check that opening xml tags match closing ones
+        """
+        pattern = re.compile(r"(?:<([^<>]+)>)(.*?)(?:</\1>)")
+        res = pattern.findall(text)
+        res = [CharacterPhrase(character=x[0], text=x[1]) for x in res]
+        return res
+
+    def __init__(self, **data):
+        super().__init__(**data)
+        self._phrases = self._parse_phrases_from_xml_tags(self.text_raw)
+
+    @property
+    def phrases(self):
+        return self._phrases
+
+    def to_character_annotated_text(self):
+        return CharacterAnnotatedText(phrases=self.phrases)
+
+
+def create_split_text_chain_v2(llm_model: GPTModels):
+    llm = get_chat_llm(llm_model=llm_model, temperature=0.0)
+
+    prompt = ChatPromptTemplate.from_messages(
+        [
+            SystemMessagePromptTemplate.from_template(SplitTextPromptV2.SYSTEM),
+            HumanMessagePromptTemplate.from_template(SplitTextPromptV2.USER),
+        ]
+    )
+
+    chain = prompt | llm | StrOutputParser() | (lambda x: SplitTextOutputV2(text_raw=x))
+    return chain
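SplitTextOutputV2 extracts phrases with a plain regular expression rather than a real XML parser. A minimal sketch of what _parse_phrases_from_xml_tags does with output like the notebook shows above (illustrative only, standalone re without the pydantic wrapper):

import re

# same pattern as in SplitTextOutputV2: the \1 backreference requires the
# closing tag to match the opening one
pattern = re.compile(r"(?:<([^<>]+)>)(.*?)(?:</\1>)")

tagged = (
    "<Jordan>“Good night,”</Jordan> <narrator>she said softly.</narrator> "
    "<Jordan>“Wake me at eight, won’t you.”</Jordan>"
)

# findall returns (tag, content) pairs; untagged text between spans is dropped
print(pattern.findall(tagged))
# [('Jordan', '“Good night,”'), ('narrator', 'she said softly.'), ('Jordan', '“Wake me at eight, won’t you.”')]

Unclosed or mismatched tags simply yield no phrase, consistent with the docstring's stance of trusting the LLM to format its response correctly; note also that without re.DOTALL a phrase cannot span a newline.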
src/utils.py
ADDED
@@ -0,0 +1,17 @@
+from enum import StrEnum
+
+from httpx import Timeout
+from langchain_openai import ChatOpenAI
+
+
+class GPTModels(StrEnum):
+    GPT_4o = "gpt-4o"
+    GPT_4o_MINI = "gpt-4o-mini"
+    GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09"
+
+
+def get_chat_llm(llm_model: GPTModels, temperature=0.0):
+    llm = ChatOpenAI(
+        model=llm_model, temperature=temperature, timeout=Timeout(60, connect=4)
+    )
+    return llm
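A note on the design: GPTModels subclasses StrEnum (available since Python 3.11), so each member is itself a str and can be passed directly as ChatOpenAI(model=llm_model). A tiny illustrative check, assuming Python 3.11+:

from enum import StrEnum


class GPTModels(StrEnum):
    GPT_4o = "gpt-4o"


# StrEnum members compare equal to, and format as, their plain string values
assert GPTModels.GPT_4o == "gpt-4o"
assert isinstance(GPTModels.GPT_4o, str)
assert f"model={GPTModels.GPT_4o}" == "model=gpt-4o"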
utils.py
DELETED
@@ -1,49 +0,0 @@
-from enum import StrEnum
-
-from httpx import Timeout
-from langchain_core.prompts import (
-    ChatPromptTemplate,
-    HumanMessagePromptTemplate,
-    SystemMessagePromptTemplate,
-)
-from langchain_openai import ChatOpenAI
-from pydantic import BaseModel
-
-from prompts import SplitTextPrompt
-
-
-class GPTModels(StrEnum):
-    GPT_4_TURBO_2024_04_09 = "gpt-4-turbo-2024-04-09"
-    GPT_4o_MINI = "gpt-4o-mini"
-
-
-class TextPart(BaseModel):
-    character: str
-    text: str
-
-
-class SplitTextOutput(BaseModel):
-    characters: list[str]
-    parts: list[TextPart]
-
-    def to_pretty_text(self):
-        lines = []
-        lines.append(f"characters: {self.characters}")
-        lines.extend(f"[{part.character}] {part.text}" for part in self.parts)
-        res = "\n".join(lines)
-        return res
-
-
-def create_split_text_chain(llm_model: GPTModels):
-    llm = ChatOpenAI(model=llm_model, temperature=0.0, timeout=Timeout(60, connect=4))
-    llm = llm.with_structured_output(SplitTextOutput)
-
-    prompt = ChatPromptTemplate.from_messages(
-        [
-            SystemMessagePromptTemplate.from_template(SplitTextPrompt.SYSTEM),
-            HumanMessagePromptTemplate.from_template(SplitTextPrompt.USER),
-        ]
-    )
-
-    chain = prompt | llm
-    return chain