Commit
•
66148ed
1
Parent(s):
cfb4a98
Adding dataset creation
Browse files- translate-prompts.ipynb +730 -122
translate-prompts.ipynb
CHANGED
@@ -13,6 +13,18 @@
|
|
13 |
{
|
14 |
"cell_type": "code",
|
15 |
"execution_count": null,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
16 |
"id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
|
17 |
"metadata": {
|
18 |
"id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
|
@@ -20,12 +32,13 @@
|
|
20 |
},
|
21 |
"outputs": [],
|
22 |
"source": [
|
23 |
-
"from huggingface_hub import InferenceClient, login"
|
|
|
24 |
]
|
25 |
},
|
26 |
{
|
27 |
"cell_type": "code",
|
28 |
-
"execution_count":
|
29 |
"id": "dc9f0411-8bf2-4a20-a6ea-331a2a486b8e",
|
30 |
"metadata": {
|
31 |
"colab": {
|
@@ -41,7 +54,7 @@
|
|
41 |
{
|
42 |
"data": {
|
43 |
"application/vnd.jupyter.widget-view+json": {
|
44 |
-
"model_id": "
|
45 |
"version_major": 2,
|
46 |
"version_minor": 0
|
47 |
},
|
@@ -76,14 +89,16 @@
|
|
76 |
},
|
77 |
{
|
78 |
"cell_type": "code",
|
79 |
-
"execution_count":
|
80 |
"id": "84e6cb89-30d3-4ef5-8063-07783798e045",
|
81 |
"metadata": {
|
82 |
-
"id": "84e6cb89-30d3-4ef5-8063-07783798e045"
|
|
|
83 |
},
|
84 |
"outputs": [],
|
85 |
"source": [
|
86 |
"MODEL = \"CohereForAI/c4ai-command-r-plus\"\n",
|
|
|
87 |
"client = InferenceClient(MODEL)"
|
88 |
]
|
89 |
},
|
@@ -97,7 +112,13 @@
|
|
97 |
"# Translation\n",
|
98 |
"Our goal is to explore translation between English and Arabic and how prompt engineering can impact it. There has been [some work](https://arxiv.org/pdf/2308.01391), but we didn't find as much as we were hoping, especially for open source models.\n",
|
99 |
"\n",
|
100 |
-
"We have created a dataset across 6 domains and want to compare each method by having human rankers. We also have human translations to ground these rankings
|
|
|
|
|
|
|
|
|
|
|
|
|
101 |
]
|
102 |
},
|
103 |
{
|
@@ -127,7 +148,7 @@
|
|
127 |
},
|
128 |
{
|
129 |
"cell_type": "code",
|
130 |
-
"execution_count":
|
131 |
"id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
|
132 |
"metadata": {
|
133 |
"id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
|
@@ -135,7 +156,7 @@
|
|
135 |
},
|
136 |
"outputs": [],
|
137 |
"source": [
|
138 |
-
"
|
139 |
]
|
140 |
},
|
141 |
{
|
@@ -146,7 +167,7 @@
|
|
146 |
},
|
147 |
"source": [
|
148 |
"### Instruction\n",
|
149 |
-
"> Translate this from
|
150 |
">\n",
|
151 |
"> Translation:\n",
|
152 |
"\n",
|
@@ -155,19 +176,20 @@
|
|
155 |
},
|
156 |
{
|
157 |
"cell_type": "code",
|
158 |
-
"execution_count":
|
159 |
"id": "b7f1722c-c484-4e22-a025-53f95943fc76",
|
160 |
"metadata": {
|
161 |
-
"id": "b7f1722c-c484-4e22-a025-53f95943fc76"
|
|
|
162 |
},
|
163 |
"outputs": [],
|
164 |
"source": [
|
165 |
-
"def baseline_chat_completion(
|
166 |
" \"\"\"\n",
|
167 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
168 |
" \"\"\"\n",
|
169 |
" messages = [\n",
|
170 |
-
" {\"role\": \"system\", \"content\":
|
171 |
" {\n",
|
172 |
" \"role\": \"user\",\n",
|
173 |
" \"content\": f\"Translate this from english to arabic: {translation_input}.\\nTranslation: \",\n",
|
@@ -178,7 +200,7 @@
|
|
178 |
},
|
179 |
{
|
180 |
"cell_type": "code",
|
181 |
-
"execution_count":
|
182 |
"id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
|
183 |
"metadata": {
|
184 |
"id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
|
@@ -188,7 +210,6 @@
|
|
188 |
"source": [
|
189 |
"translation_input = \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\"\n",
|
190 |
"response = baseline_chat_completion(\n",
|
191 |
-
" system_prompt,\n",
|
192 |
" translation_input,\n",
|
193 |
")"
|
194 |
]
|
@@ -201,23 +222,21 @@
|
|
201 |
},
|
202 |
"source": [
|
203 |
"### Token Cost\n",
|
204 |
-
"Here we can see that the cost is quite cheap, only
|
205 |
]
|
206 |
},
|
207 |
{
|
208 |
"cell_type": "code",
|
209 |
-
"execution_count":
|
210 |
-
"id": "
|
211 |
"metadata": {
|
212 |
-
"id": "4e305b1e-56e0-44da-8c17-496cbcc35fad",
|
213 |
-
"outputId": "a97a8a75-e2e6-4ac3-e7ac-db674a1f46c9",
|
214 |
"tags": []
|
215 |
},
|
216 |
"outputs": [
|
217 |
{
|
218 |
"data": {
|
219 |
"text/plain": [
|
220 |
-
"
|
221 |
]
|
222 |
},
|
223 |
"execution_count": 7,
|
@@ -226,12 +245,12 @@
|
|
226 |
}
|
227 |
],
|
228 |
"source": [
|
229 |
-
"response.usage.prompt_tokens"
|
230 |
]
|
231 |
},
|
232 |
{
|
233 |
"cell_type": "code",
|
234 |
-
"execution_count":
|
235 |
"id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
|
236 |
"metadata": {
|
237 |
"id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
|
@@ -243,7 +262,7 @@
|
|
243 |
"name": "stdout",
|
244 |
"output_type": "stream",
|
245 |
"text": [
|
246 |
-
"
|
247 |
]
|
248 |
}
|
249 |
],
|
@@ -278,7 +297,7 @@
|
|
278 |
},
|
279 |
{
|
280 |
"cell_type": "code",
|
281 |
-
"execution_count":
|
282 |
"id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
|
283 |
"metadata": {
|
284 |
"id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
|
@@ -316,37 +335,67 @@
|
|
316 |
},
|
317 |
{
|
318 |
"cell_type": "code",
|
319 |
-
"execution_count":
|
320 |
-
"id": "
|
321 |
"metadata": {
|
322 |
-
"
|
323 |
},
|
324 |
"outputs": [],
|
325 |
"source": [
|
326 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
327 |
" \"\"\"\n",
|
328 |
-
" Generates a completion for a chat conversation using a specified system prompt and a user input
|
|
|
329 |
" \"\"\"\n",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
330 |
"\n",
|
331 |
-
"
|
332 |
-
"Purpose of the translation: {dataset_to_purpose_target[dataset]['purpose']}\n",
|
333 |
-
"Target audience: {dataset_to_purpose_target[dataset]['audience']}\n",
|
334 |
-
"[source text] `{translation_input}`\n",
|
335 |
-
"[translated text] \"\"\"\n",
|
336 |
-
"\n",
|
337 |
" messages = [\n",
|
338 |
-
" {\"role\": \"system\", \"content\":
|
339 |
" {\n",
|
340 |
" \"role\": \"user\",\n",
|
341 |
" \"content\": prompt,\n",
|
342 |
" },\n",
|
343 |
" ]\n",
|
344 |
-
"
|
|
|
|
|
|
|
345 |
]
|
346 |
},
|
347 |
{
|
348 |
"cell_type": "code",
|
349 |
-
"execution_count":
|
350 |
"id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
|
351 |
"metadata": {
|
352 |
"id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
|
@@ -355,23 +404,21 @@
|
|
355 |
"outputs": [],
|
356 |
"source": [
|
357 |
"translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
|
358 |
-
"response = purpose_driven_chat_completion(
|
359 |
]
|
360 |
},
|
361 |
{
|
362 |
"cell_type": "code",
|
363 |
-
"execution_count":
|
364 |
-
"id": "
|
365 |
"metadata": {
|
366 |
-
"id": "6296d255-d11d-4df7-aa0e-1226ef3d963a",
|
367 |
-
"outputId": "45282f39-18ef-4dc0-f708-1113ca1769ce",
|
368 |
"tags": []
|
369 |
},
|
370 |
"outputs": [
|
371 |
{
|
372 |
"data": {
|
373 |
"text/plain": [
|
374 |
-
"
|
375 |
]
|
376 |
},
|
377 |
"execution_count": 12,
|
@@ -380,12 +427,12 @@
|
|
380 |
}
|
381 |
],
|
382 |
"source": [
|
383 |
-
"response.usage.prompt_tokens"
|
384 |
]
|
385 |
},
|
386 |
{
|
387 |
"cell_type": "code",
|
388 |
-
"execution_count":
|
389 |
"id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
|
390 |
"metadata": {
|
391 |
"id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
|
@@ -397,12 +444,18 @@
|
|
397 |
"name": "stdout",
|
398 |
"output_type": "stream",
|
399 |
"text": [
|
400 |
-
"
|
|
|
|
|
|
|
|
|
401 |
]
|
402 |
}
|
403 |
],
|
404 |
"source": [
|
405 |
-
"
|
|
|
|
|
406 |
]
|
407 |
},
|
408 |
{
|
@@ -434,13 +487,15 @@
|
|
434 |
"id": "2f5deb21-18fb-4c5b-9045-c7fe5e751c05"
|
435 |
},
|
436 |
"source": [
|
437 |
-
"Its usually helpful if we tell the LLM what we want to create when we prompt it.
|
438 |
"\n",
|
439 |
-
"> I want to translate the following
|
440 |
-
"{\"
|
441 |
-
"Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet
|
442 |
-
"
|
443 |
-
"
|
|
|
|
|
444 |
]
|
445 |
},
|
446 |
{
|
@@ -455,7 +510,7 @@
|
|
455 |
},
|
456 |
{
|
457 |
"cell_type": "code",
|
458 |
-
"execution_count":
|
459 |
"id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
|
460 |
"metadata": {
|
461 |
"id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
|
@@ -463,7 +518,7 @@
|
|
463 |
},
|
464 |
"outputs": [],
|
465 |
"source": [
|
466 |
-
"
|
467 |
" {\n",
|
468 |
" \"type\": \"function\",\n",
|
469 |
" \"function\": {\n",
|
@@ -503,38 +558,39 @@
|
|
503 |
},
|
504 |
{
|
505 |
"cell_type": "code",
|
506 |
-
"execution_count":
|
507 |
"id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
|
508 |
"metadata": {
|
509 |
-
"id": "b6436ce6-03af-4206-a283-0c2ecd17bd88"
|
|
|
510 |
},
|
511 |
"outputs": [],
|
512 |
"source": [
|
513 |
-
"def tool_call_chat_completion(
|
514 |
" \"\"\"\n",
|
515 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
516 |
" \"\"\"\n",
|
517 |
"\n",
|
518 |
-
" prompt = f\"\"\"I want to translate the following
|
519 |
"{{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}}.\n",
|
520 |
"Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
|
521 |
-
"
|
522 |
-
"{translation_input}\n",
|
523 |
-
"
|
524 |
"\"\"\"\n",
|
525 |
" messages = [\n",
|
526 |
-
" {\"role\": \"system\", \"content\":
|
527 |
" {\n",
|
528 |
" \"role\": \"user\",\n",
|
529 |
" \"content\": prompt,\n",
|
530 |
" },\n",
|
531 |
" ]\n",
|
532 |
-
" return client.chat_completion(messages, max_tokens=10_000, tools=
|
533 |
]
|
534 |
},
|
535 |
{
|
536 |
"cell_type": "code",
|
537 |
-
"execution_count":
|
538 |
"id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
|
539 |
"metadata": {
|
540 |
"id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
|
@@ -543,13 +599,13 @@
|
|
543 |
"outputs": [],
|
544 |
"source": [
|
545 |
"translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
|
546 |
-
"response = tool_call_chat_completion(
|
547 |
]
|
548 |
},
|
549 |
{
|
550 |
"cell_type": "code",
|
551 |
-
"execution_count":
|
552 |
-
"id": "
|
553 |
"metadata": {
|
554 |
"id": "07e0f133-f6e1-4bc6-969c-da9d36bfba2f",
|
555 |
"outputId": "82e2506a-7336-4909-b8e4-fc52f671c511",
|
@@ -559,7 +615,7 @@
|
|
559 |
{
|
560 |
"data": {
|
561 |
"text/plain": [
|
562 |
-
"
|
563 |
]
|
564 |
},
|
565 |
"execution_count": 17,
|
@@ -568,13 +624,12 @@
|
|
568 |
}
|
569 |
],
|
570 |
"source": [
|
571 |
-
"
|
572 |
-
"function_call_tokens"
|
573 |
]
|
574 |
},
|
575 |
{
|
576 |
"cell_type": "code",
|
577 |
-
"execution_count":
|
578 |
"id": "02be1827-7137-463f-a026-0b26dec6f552",
|
579 |
"metadata": {
|
580 |
"id": "02be1827-7137-463f-a026-0b26dec6f552",
|
@@ -586,22 +641,21 @@
|
|
586 |
"name": "stdout",
|
587 |
"output_type": "stream",
|
588 |
"text": [
|
589 |
-
"{'assumptions relating to the content': 'The text assumes that
|
590 |
-
" '
|
591 |
-
" '
|
592 |
-
" '
|
593 |
-
" '
|
594 |
-
"
|
595 |
-
"
|
596 |
-
"
|
597 |
-
" '
|
598 |
-
" 'purpose': '
|
599 |
-
" '
|
600 |
-
" '
|
601 |
-
" 'health
|
602 |
-
"
|
603 |
-
"
|
604 |
-
" 'Stakeholder Groups'}\n"
|
605 |
]
|
606 |
}
|
607 |
],
|
@@ -613,38 +667,36 @@
|
|
613 |
},
|
614 |
{
|
615 |
"cell_type": "code",
|
616 |
-
"execution_count":
|
617 |
"id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
|
618 |
"metadata": {
|
619 |
-
"id": "7575bd09-2d20-49ae-bb10-162a0e469f16"
|
|
|
620 |
},
|
621 |
"outputs": [],
|
622 |
"source": [
|
623 |
-
"def automatic_purpose_driven_chat_completion(
|
624 |
" \"\"\"\n",
|
625 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
626 |
" \"\"\"\n",
|
627 |
"\n",
|
628 |
-
" prompt = f\"\"\"Given the following
|
629 |
-
"{
|
630 |
-
"
|
631 |
-
"{translation_input}\n",
|
632 |
-
"
|
633 |
"Translation:\n",
|
634 |
"\"\"\"\n",
|
635 |
" messages = [\n",
|
636 |
-
" {\"role\": \"system\", \"content\":
|
637 |
-
" {\n",
|
638 |
-
" \"role\": \"user\",\n",
|
639 |
-
" \"content\": prompt,\n",
|
640 |
-
" },\n",
|
641 |
" ]\n",
|
642 |
" return client.chat_completion(messages, max_tokens=10_000)"
|
643 |
]
|
644 |
},
|
645 |
{
|
646 |
"cell_type": "code",
|
647 |
-
"execution_count":
|
648 |
"id": "32bccca0-c866-4006-84d1-d5b783b73689",
|
649 |
"metadata": {
|
650 |
"id": "32bccca0-c866-4006-84d1-d5b783b73689",
|
@@ -652,12 +704,12 @@
|
|
652 |
},
|
653 |
"outputs": [],
|
654 |
"source": [
|
655 |
-
"response = automatic_purpose_driven_chat_completion(
|
656 |
]
|
657 |
},
|
658 |
{
|
659 |
"cell_type": "code",
|
660 |
-
"execution_count":
|
661 |
"id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
|
662 |
"metadata": {
|
663 |
"id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
|
@@ -668,7 +720,7 @@
|
|
668 |
{
|
669 |
"data": {
|
670 |
"text/plain": [
|
671 |
-
"
|
672 |
]
|
673 |
},
|
674 |
"execution_count": 21,
|
@@ -677,13 +729,12 @@
|
|
677 |
}
|
678 |
],
|
679 |
"source": [
|
680 |
-
"
|
681 |
-
"automatic_purpose_driven_tokens"
|
682 |
]
|
683 |
},
|
684 |
{
|
685 |
"cell_type": "code",
|
686 |
-
"execution_count":
|
687 |
"id": "462ca84c-9ffd-4924-880d-e06b724caf02",
|
688 |
"metadata": {
|
689 |
"id": "462ca84c-9ffd-4924-880d-e06b724caf02",
|
@@ -695,9 +746,15 @@
|
|
695 |
"name": "stdout",
|
696 |
"output_type": "stream",
|
697 |
"text": [
|
698 |
-
"
|
699 |
-
"\n",
|
700 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
701 |
]
|
702 |
}
|
703 |
],
|
@@ -707,35 +764,586 @@
|
|
707 |
},
|
708 |
{
|
709 |
"cell_type": "markdown",
|
710 |
-
"id": "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
711 |
"metadata": {
|
712 |
-
"
|
713 |
},
|
|
|
714 |
"source": [
|
715 |
-
"
|
|
|
|
|
|
|
716 |
]
|
717 |
},
|
718 |
{
|
719 |
"cell_type": "code",
|
720 |
-
"execution_count":
|
721 |
-
"id": "
|
722 |
"metadata": {
|
723 |
-
"id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
|
724 |
-
"outputId": "d3b9b994-e3af-4f05-88b4-3b29778d1dc7",
|
725 |
"tags": []
|
726 |
},
|
727 |
"outputs": [
|
728 |
{
|
729 |
"data": {
|
730 |
"text/plain": [
|
731 |
-
"
|
732 |
]
|
733 |
},
|
734 |
-
"execution_count":
|
735 |
"metadata": {},
|
736 |
"output_type": "execute_result"
|
737 |
}
|
738 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
739 |
"source": [
|
740 |
"from huggingface_hub import HfApi\n",
|
741 |
"\n",
|
|
|
13 |
{
|
14 |
"cell_type": "code",
|
15 |
"execution_count": null,
|
16 |
+
"id": "ddb3dfde-39cc-4ad9-917e-48413add2d9b",
|
17 |
+
"metadata": {
|
18 |
+
"tags": []
|
19 |
+
},
|
20 |
+
"outputs": [],
|
21 |
+
"source": [
|
22 |
+
"%pip install -U -q transformers huggingface-hub"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "code",
|
27 |
+
"execution_count": 1,
|
28 |
"id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
|
29 |
"metadata": {
|
30 |
"id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
|
|
|
32 |
},
|
33 |
"outputs": [],
|
34 |
"source": [
|
35 |
+
"from huggingface_hub import InferenceClient, login\n",
|
36 |
+
"from transformers import AutoTokenizer"
|
37 |
]
|
38 |
},
|
39 |
{
|
40 |
"cell_type": "code",
|
41 |
+
"execution_count": 2,
|
42 |
"id": "dc9f0411-8bf2-4a20-a6ea-331a2a486b8e",
|
43 |
"metadata": {
|
44 |
"colab": {
|
|
|
54 |
{
|
55 |
"data": {
|
56 |
"application/vnd.jupyter.widget-view+json": {
|
57 |
+
"model_id": "da0ae7fafffb4005a5325a53896feb82",
|
58 |
"version_major": 2,
|
59 |
"version_minor": 0
|
60 |
},
|
|
|
89 |
},
|
90 |
{
|
91 |
"cell_type": "code",
|
92 |
+
"execution_count": 3,
|
93 |
"id": "84e6cb89-30d3-4ef5-8063-07783798e045",
|
94 |
"metadata": {
|
95 |
+
"id": "84e6cb89-30d3-4ef5-8063-07783798e045",
|
96 |
+
"tags": []
|
97 |
},
|
98 |
"outputs": [],
|
99 |
"source": [
|
100 |
"MODEL = \"CohereForAI/c4ai-command-r-plus\"\n",
|
101 |
+
"tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n",
|
102 |
"client = InferenceClient(MODEL)"
|
103 |
]
|
104 |
},
|
|
|
112 |
"# Translation\n",
|
113 |
"Our goal is to explore translation between English and Arabic and how prompt engineering can impact it. There has been [some work](https://arxiv.org/pdf/2308.01391), but we didn't find as much as we were hoping, especially for open source models.\n",
|
114 |
"\n",
|
115 |
+
"We have created a dataset [arabic-translation-prompt-engineering/TpDwD](https://huggingface.co/datasets/arabic-translation-prompt-engineering/TpDwD) across 6 domains and want to compare each method by having human rankers. We also have human translations to ground these rankings.\n",
|
116 |
+
"\n",
|
117 |
+
"We will evaluate the following methods:\n",
|
118 |
+
"- Baseline\n",
|
119 |
+
"- Manual Purpose Driven\n",
|
120 |
+
"- Automatic Purpose Driven\n",
|
121 |
+
"- Automatic Motivation Driven"
|
122 |
]
|
123 |
},
|
124 |
{
|
|
|
148 |
},
|
149 |
{
|
150 |
"cell_type": "code",
|
151 |
+
"execution_count": 4,
|
152 |
"id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
|
153 |
"metadata": {
|
154 |
"id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
|
|
|
156 |
},
|
157 |
"outputs": [],
|
158 |
"source": [
|
159 |
+
"baseline_system_prompt = \"\"\"You are a skilled translator with extensive experience in English and Arabic translations. You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text. Your role is crucial in ensuring clear and precise communication in our multilingual system.\"\"\""
|
160 |
]
|
161 |
},
|
162 |
{
|
|
|
167 |
},
|
168 |
"source": [
|
169 |
"### Instruction\n",
|
170 |
+
"> Translate this from english to arabic: {translation_input}.\n",
|
171 |
">\n",
|
172 |
"> Translation:\n",
|
173 |
"\n",
|
|
|
176 |
},
|
177 |
{
|
178 |
"cell_type": "code",
|
179 |
+
"execution_count": 5,
|
180 |
"id": "b7f1722c-c484-4e22-a025-53f95943fc76",
|
181 |
"metadata": {
|
182 |
+
"id": "b7f1722c-c484-4e22-a025-53f95943fc76",
|
183 |
+
"tags": []
|
184 |
},
|
185 |
"outputs": [],
|
186 |
"source": [
|
187 |
+
"def baseline_chat_completion(translation_input):\n",
|
188 |
" \"\"\"\n",
|
189 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
190 |
" \"\"\"\n",
|
191 |
" messages = [\n",
|
192 |
+
" {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
|
193 |
" {\n",
|
194 |
" \"role\": \"user\",\n",
|
195 |
" \"content\": f\"Translate this from english to arabic: {translation_input}.\\nTranslation: \",\n",
|
|
|
200 |
},
|
201 |
{
|
202 |
"cell_type": "code",
|
203 |
+
"execution_count": 6,
|
204 |
"id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
|
205 |
"metadata": {
|
206 |
"id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
|
|
|
210 |
"source": [
|
211 |
"translation_input = \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\"\n",
|
212 |
"response = baseline_chat_completion(\n",
|
|
|
213 |
" translation_input,\n",
|
214 |
")"
|
215 |
]
|
|
|
222 |
},
|
223 |
"source": [
|
224 |
"### Token Cost\n",
|
225 |
+
"Here we can see that the cost is quite cheap, only 96 tokens!"
|
226 |
]
|
227 |
},
|
228 |
{
|
229 |
"cell_type": "code",
|
230 |
+
"execution_count": 7,
|
231 |
+
"id": "2afc890f-5d8d-4df8-b19a-25888211cf18",
|
232 |
"metadata": {
|
|
|
|
|
233 |
"tags": []
|
234 |
},
|
235 |
"outputs": [
|
236 |
{
|
237 |
"data": {
|
238 |
"text/plain": [
|
239 |
+
"'Baseline Total Prompt tokens: 96'"
|
240 |
]
|
241 |
},
|
242 |
"execution_count": 7,
|
|
|
245 |
}
|
246 |
],
|
247 |
"source": [
|
248 |
+
"f\"Baseline Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
|
249 |
]
|
250 |
},
|
251 |
{
|
252 |
"cell_type": "code",
|
253 |
+
"execution_count": 8,
|
254 |
"id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
|
255 |
"metadata": {
|
256 |
"id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
|
|
|
262 |
"name": "stdout",
|
263 |
"output_type": "stream",
|
264 |
"text": [
|
265 |
+
"يسبح في الحلبة كالفراشة ويلسع كالنحلة - لا يمكن ليديه أن تصيبا ما لا تستطيع عيناه رؤيته\n"
|
266 |
]
|
267 |
}
|
268 |
],
|
|
|
297 |
},
|
298 |
{
|
299 |
"cell_type": "code",
|
300 |
+
"execution_count": 9,
|
301 |
"id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
|
302 |
"metadata": {
|
303 |
"id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
|
|
|
335 |
},
|
336 |
{
|
337 |
"cell_type": "code",
|
338 |
+
"execution_count": 10,
|
339 |
+
"id": "d0c3418e-0b87-458f-8517-1ca3e59ab57a",
|
340 |
"metadata": {
|
341 |
+
"tags": []
|
342 |
},
|
343 |
"outputs": [],
|
344 |
"source": [
|
345 |
+
"# Define the translation tool function\n",
|
346 |
+
"purpose_driven_translation_tools = [\n",
|
347 |
+
" {\n",
|
348 |
+
" \"type\": \"function\",\n",
|
349 |
+
" \"function\": {\n",
|
350 |
+
" \"name\": \"purpose_driven_translation\",\n",
|
351 |
+
" \"description\": \"Translate given the purpose and the target audience.\",\n",
|
352 |
+
" \"parameters\": {\n",
|
353 |
+
" \"type\": \"object\",\n",
|
354 |
+
" \"properties\": {\n",
|
355 |
+
" \"translation\": {\n",
|
356 |
+
" \"type\": \"string\",\n",
|
357 |
+
" \"description\": \"The translated \\\"source_text\\\".\",\n",
|
358 |
+
" },\n",
|
359 |
+
" },\n",
|
360 |
+
" \"required\": [\"translation\"],\n",
|
361 |
+
" },\n",
|
362 |
+
" },\n",
|
363 |
+
" }\n",
|
364 |
+
"]\n",
|
365 |
+
"\n",
|
366 |
+
"# Create the purpose-driven chat completion function using function calling\n",
|
367 |
+
"def purpose_driven_chat_completion(translation_input, dataset):\n",
|
368 |
" \"\"\"\n",
|
369 |
+
" Generates a completion for a chat conversation using a specified system prompt and a user input,\n",
|
370 |
+
" incorporating function calling to retrieve translation context.\n",
|
371 |
" \"\"\"\n",
|
372 |
+
" \n",
|
373 |
+
" # Prepare the prompt\n",
|
374 |
+
" prompt = f\"\"\"Translate the English \"source text\" into Arabic. Please fulfill the \"Purpose of the translation\" and tailor it to the \"target audience\". Respond in a json format with just the translation as the key.\n",
|
375 |
+
"{{\n",
|
376 |
+
" \"Purpose of the translation\": \"{dataset_to_purpose_target[dataset]['purpose']}\"\n",
|
377 |
+
" \"Target audience\": \"{dataset_to_purpose_target[dataset]['audience']}\"\n",
|
378 |
+
" \"source text\" `{translation_input}`\n",
|
379 |
+
"}} \n",
|
380 |
+
"Translation json: \"\"\"\n",
|
381 |
"\n",
|
382 |
+
" # Initial messages, including the function call to get context\n",
|
|
|
|
|
|
|
|
|
|
|
383 |
" messages = [\n",
|
384 |
+
" {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
|
385 |
" {\n",
|
386 |
" \"role\": \"user\",\n",
|
387 |
" \"content\": prompt,\n",
|
388 |
" },\n",
|
389 |
" ]\n",
|
390 |
+
"\n",
|
391 |
+
" \n",
|
392 |
+
" # Call the chat completion API with the function tools and specific tool choice\n",
|
393 |
+
" return client.chat_completion(messages, max_tokens=10_000, tools=purpose_driven_translation_tools, tool_choice='purpose_driven_translation')"
|
394 |
]
|
395 |
},
|
396 |
{
|
397 |
"cell_type": "code",
|
398 |
+
"execution_count": 11,
|
399 |
"id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
|
400 |
"metadata": {
|
401 |
"id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
|
|
|
404 |
"outputs": [],
|
405 |
"source": [
|
406 |
"translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
|
407 |
+
"response = purpose_driven_chat_completion(translation_input, \"ELRC-24ss\")"
|
408 |
]
|
409 |
},
|
410 |
{
|
411 |
"cell_type": "code",
|
412 |
+
"execution_count": 12,
|
413 |
+
"id": "51cc3241-ef6d-43e8-8740-defb6f542918",
|
414 |
"metadata": {
|
|
|
|
|
415 |
"tags": []
|
416 |
},
|
417 |
"outputs": [
|
418 |
{
|
419 |
"data": {
|
420 |
"text/plain": [
|
421 |
+
"'Manual Purpose Driven Total Prompt tokens: 350'"
|
422 |
]
|
423 |
},
|
424 |
"execution_count": 12,
|
|
|
427 |
}
|
428 |
],
|
429 |
"source": [
|
430 |
+
"f\"Manual Purpose Driven Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
|
431 |
]
|
432 |
},
|
433 |
{
|
434 |
"cell_type": "code",
|
435 |
+
"execution_count": 13,
|
436 |
"id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
|
437 |
"metadata": {
|
438 |
"id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
|
|
|
444 |
"name": "stdout",
|
445 |
"output_type": "stream",
|
446 |
"text": [
|
447 |
+
"{'translation': 'لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد ... '\n",
|
448 |
+
" 'الرؤى، فإن هذا يؤدي إلى نقاش حول ما إذا كان ينبغي التركيز على '\n",
|
449 |
+
" 'صحة النظام البيئي أو رفاهية الإنسان ... إن مسألة ما إذا كان '\n",
|
450 |
+
" 'الأولوية للنظم البيئية أو الناس تؤثر بشكل كبير على تقييم '\n",
|
451 |
+
" 'أصحاب المصلحة للحالات الاجتماعية والبيئية المرغوبة.'}\n"
|
452 |
]
|
453 |
}
|
454 |
],
|
455 |
"source": [
|
456 |
+
"from pprint import pprint\n",
|
457 |
+
"description_json = response.choices[0].message.tool_calls[0].function.arguments\n",
|
458 |
+
"pprint(description_json)"
|
459 |
]
|
460 |
},
|
461 |
{
|
|
|
487 |
"id": "2f5deb21-18fb-4c5b-9045-c7fe5e751c05"
|
488 |
},
|
489 |
"source": [
|
490 |
+
"Its usually helpful if we tell the LLM what we want to create when we prompt it. \n",
|
491 |
"\n",
|
492 |
+
"> ```I want to translate the following source_text from English into Arabic. But first I want to create a json that includes the following:\n",
|
493 |
+
"{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}.\n",
|
494 |
+
"Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
|
495 |
+
"{\n",
|
496 |
+
" \"source_text\": {translation_input}\n",
|
497 |
+
"}```\n",
|
498 |
+
"\n"
|
499 |
]
|
500 |
},
|
501 |
{
|
|
|
510 |
},
|
511 |
{
|
512 |
"cell_type": "code",
|
513 |
+
"execution_count": 14,
|
514 |
"id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
|
515 |
"metadata": {
|
516 |
"id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
|
|
|
518 |
},
|
519 |
"outputs": [],
|
520 |
"source": [
|
521 |
+
"automatic_purpose_driven_translation_tools = [\n",
|
522 |
" {\n",
|
523 |
" \"type\": \"function\",\n",
|
524 |
" \"function\": {\n",
|
|
|
558 |
},
|
559 |
{
|
560 |
"cell_type": "code",
|
561 |
+
"execution_count": 15,
|
562 |
"id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
|
563 |
"metadata": {
|
564 |
+
"id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
|
565 |
+
"tags": []
|
566 |
},
|
567 |
"outputs": [],
|
568 |
"source": [
|
569 |
+
"def tool_call_chat_completion(translation_input):\n",
|
570 |
" \"\"\"\n",
|
571 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
572 |
" \"\"\"\n",
|
573 |
"\n",
|
574 |
+
" prompt = f\"\"\"I want to translate the following source_text from English into Arabic. But first I want to create a json that includes the following:\n",
|
575 |
"{{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}}.\n",
|
576 |
"Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
|
577 |
+
"{{\n",
|
578 |
+
" \"source_text\": {translation_input}\n",
|
579 |
+
"}}\n",
|
580 |
"\"\"\"\n",
|
581 |
" messages = [\n",
|
582 |
+
" {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
|
583 |
" {\n",
|
584 |
" \"role\": \"user\",\n",
|
585 |
" \"content\": prompt,\n",
|
586 |
" },\n",
|
587 |
" ]\n",
|
588 |
+
" return client.chat_completion(messages, max_tokens=10_000, tools=automatic_purpose_driven_translation_tools, tool_choice='get_translation_audience_purpose')"
|
589 |
]
|
590 |
},
|
591 |
{
|
592 |
"cell_type": "code",
|
593 |
+
"execution_count": 16,
|
594 |
"id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
|
595 |
"metadata": {
|
596 |
"id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
|
|
|
599 |
"outputs": [],
|
600 |
"source": [
|
601 |
"translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
|
602 |
+
"response = tool_call_chat_completion(translation_input)"
|
603 |
]
|
604 |
},
|
605 |
{
|
606 |
"cell_type": "code",
|
607 |
+
"execution_count": 17,
|
608 |
+
"id": "f87110a2-a40b-4d65-a12d-728dbdda8fbe",
|
609 |
"metadata": {
|
610 |
"id": "07e0f133-f6e1-4bc6-969c-da9d36bfba2f",
|
611 |
"outputId": "82e2506a-7336-4909-b8e4-fc52f671c511",
|
|
|
615 |
{
|
616 |
"data": {
|
617 |
"text/plain": [
|
618 |
+
"'Function Calling Prompt tokens: 406'"
|
619 |
]
|
620 |
},
|
621 |
"execution_count": 17,
|
|
|
624 |
}
|
625 |
],
|
626 |
"source": [
|
627 |
+
"f\"Function Calling Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
|
|
|
628 |
]
|
629 |
},
|
630 |
{
|
631 |
"cell_type": "code",
|
632 |
+
"execution_count": 18,
|
633 |
"id": "02be1827-7137-463f-a026-0b26dec6f552",
|
634 |
"metadata": {
|
635 |
"id": "02be1827-7137-463f-a026-0b26dec6f552",
|
|
|
641 |
"name": "stdout",
|
642 |
"output_type": "stream",
|
643 |
"text": [
|
644 |
+
"{'assumptions relating to the content': 'The source text assumes that there is '\n",
|
645 |
+
" 'a debate between ecological health '\n",
|
646 |
+
" 'and human well-being, and that '\n",
|
647 |
+
" 'stakeholders have different '\n",
|
648 |
+
" 'priorities that influence their '\n",
|
649 |
+
" 'assessment of desirable ecological '\n",
|
650 |
+
" 'and social outcomes.',\n",
|
651 |
+
" 'audience': 'Individuals interested in environmental policy, ecology, '\n",
|
652 |
+
" 'sustainability, and/or stakeholder engagement.',\n",
|
653 |
+
" 'purpose': 'To communicate observations about the varying priorities of '\n",
|
654 |
+
" 'different stakeholder groups and how these priorities impact '\n",
|
655 |
+
" 'their definition of vision, particularly in the context of '\n",
|
656 |
+
" 'ecosystem health versus human well-being.',\n",
|
657 |
+
" 'subject': 'Stakeholder priorities and their impact on defining visions '\n",
|
658 |
+
" 'related to ecological and social outcomes.'}\n"
|
|
|
659 |
]
|
660 |
}
|
661 |
],
|
|
|
667 |
},
|
668 |
{
|
669 |
"cell_type": "code",
|
670 |
+
"execution_count": 19,
|
671 |
"id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
|
672 |
"metadata": {
|
673 |
+
"id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
|
674 |
+
"tags": []
|
675 |
},
|
676 |
"outputs": [],
|
677 |
"source": [
|
678 |
+
"def automatic_purpose_driven_chat_completion(translation_input, description_json):\n",
|
679 |
" \"\"\"\n",
|
680 |
" Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
|
681 |
" \"\"\"\n",
|
682 |
"\n",
|
683 |
+
" prompt = f\"\"\"Given the following description translate source_text from English to Arabic\n",
|
684 |
+
"{{\n",
|
685 |
+
" \"description\": {description_json},\n",
|
686 |
+
" \"translation\": {translation_input}\n",
|
687 |
+
"}}\n",
|
688 |
"Translation:\n",
|
689 |
"\"\"\"\n",
|
690 |
" messages = [\n",
|
691 |
+
" {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
|
692 |
+
" {\"role\": \"user\", \"content\": prompt},\n",
|
|
|
|
|
|
|
693 |
" ]\n",
|
694 |
" return client.chat_completion(messages, max_tokens=10_000)"
|
695 |
]
|
696 |
},
|
697 |
{
|
698 |
"cell_type": "code",
|
699 |
+
"execution_count": 20,
|
700 |
"id": "32bccca0-c866-4006-84d1-d5b783b73689",
|
701 |
"metadata": {
|
702 |
"id": "32bccca0-c866-4006-84d1-d5b783b73689",
|
|
|
704 |
},
|
705 |
"outputs": [],
|
706 |
"source": [
|
707 |
+
"response = automatic_purpose_driven_chat_completion(translation_input, description_json)"
|
708 |
]
|
709 |
},
|
710 |
{
|
711 |
"cell_type": "code",
|
712 |
+
"execution_count": 21,
|
713 |
"id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
|
714 |
"metadata": {
|
715 |
"id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
|
|
|
720 |
{
|
721 |
"data": {
|
722 |
"text/plain": [
|
723 |
+
"'Automatic Purpose Driven Total Prompt tokens: 235'"
|
724 |
]
|
725 |
},
|
726 |
"execution_count": 21,
|
|
|
729 |
}
|
730 |
],
|
731 |
"source": [
|
732 |
+
"f\"Automatic Purpose Driven Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
|
|
|
733 |
]
|
734 |
},
|
735 |
{
|
736 |
"cell_type": "code",
|
737 |
+
"execution_count": 22,
|
738 |
"id": "462ca84c-9ffd-4924-880d-e06b724caf02",
|
739 |
"metadata": {
|
740 |
"id": "462ca84c-9ffd-4924-880d-e06b724caf02",
|
|
|
746 |
"name": "stdout",
|
747 |
"output_type": "stream",
|
748 |
"text": [
|
749 |
+
"{\n",
|
750 |
+
" \"description\": {\n",
|
751 |
+
" \"الافتراضات المتعلقة بالمحتوى\": \"يفترض النص المصدري وجود نقاش بين الصحة البيئية ورفاهية الإنسان، وأن أصحاب المصلحة لديهم أولويات مختلفة تؤثر على تقييمهم للنتائج البيئية والاجتماعية المرجوة.\",\n",
|
752 |
+
" \"الجمهور\": \"الأفراد المهتمون بالسياسة البيئية، أو علم البيئة، أو الاستدامة، و/أو مشاركة أصحاب المصلحة.\",\n",
|
753 |
+
" \"الغرض\": \"إيصال الملاحظات حول الأولويات المتنوعة لمجموعات أصحاب المصلحة المختلفة، وكيف تؤثر هذه الأولويات على تعريفهم للرؤى، خاصة في سياق صحة الأنظمة البيئية مقابل رفاهية الإنسان.\",\n",
|
754 |
+
" \"الموضوع\": \"أولويات أصحاب المصلحة وتأثيرها على تحديد الرؤى المتعلقة بالنتائج البيئية والاجتماعية.\"\n",
|
755 |
+
" },\n",
|
756 |
+
" \"الترجمة\": \"لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد ... الرؤى، فإن هذا يؤدي إلى نقاش حول ما إذا كان ينبغي التأكيد على صحة النظام البيئي أو رفاهية الإنسان ... سواء كانت الأولوية للنظم البيئية أو للبشر يؤثر بشكل كبير على تقييم أصحاب المصلحة للحالات البيئية والاجتماعية المرغوبة.\"\n",
|
757 |
+
"}\n"
|
758 |
]
|
759 |
}
|
760 |
],
|
|
|
764 |
},
|
765 |
{
|
766 |
"cell_type": "markdown",
|
767 |
+
"id": "28a0e518-358e-44f0-97a4-ea76a5563743",
|
768 |
+
"metadata": {},
|
769 |
+
"source": [
|
770 |
+
"### Helper Function"
|
771 |
+
]
|
772 |
+
},
|
773 |
+
{
|
774 |
+
"cell_type": "code",
|
775 |
+
"execution_count": 23,
|
776 |
+
"id": "7fbf9e01-23d9-41b0-a1e3-fd7a99c55bd0",
|
777 |
"metadata": {
|
778 |
+
"tags": []
|
779 |
},
|
780 |
+
"outputs": [],
|
781 |
"source": [
|
782 |
+
"def automatic_purpose_driven_chat(translation_input):\n",
|
783 |
+
" response = tool_call_chat_completion(translation_input)\n",
|
784 |
+
" description_json = response.choices[0].message.tool_calls[0].function.arguments\n",
|
785 |
+
" return automatic_purpose_driven_chat_completion(translation_input, description_json)"
|
786 |
]
|
787 |
},
|
788 |
{
|
789 |
"cell_type": "code",
|
790 |
+
"execution_count": 24,
|
791 |
+
"id": "8c5dd9e6-44c1-4ac7-92ef-c2e594b7b91d",
|
792 |
"metadata": {
|
|
|
|
|
793 |
"tags": []
|
794 |
},
|
795 |
"outputs": [
|
796 |
{
|
797 |
"data": {
|
798 |
"text/plain": [
|
799 |
+
"'الافتراضات المتعلقة بالمحتوى: لا توجد افتراضات محددة.\\n\\nالجمهور المستهدف: جمهور عام لا يحتاج إلى معرفة تقنية محددة.\\n\\nالغرض: نقل رسالة بسيطة لاختبار الترجمة.\\n\\nالموضوع: اختبار الترجمة\\n\\nالترجمة: هذا اختبار'"
|
800 |
]
|
801 |
},
|
802 |
+
"execution_count": 24,
|
803 |
"metadata": {},
|
804 |
"output_type": "execute_result"
|
805 |
}
|
806 |
],
|
807 |
+
"source": [
|
808 |
+
"automatic_purpose_driven_chat(\"This is a test\").choices[0].message.content"
|
809 |
+
]
|
810 |
+
},
|
811 |
+
{
|
812 |
+
"cell_type": "markdown",
|
813 |
+
"id": "4abf1aeb-ee4e-4b8f-97c3-e6b1664ac8b8",
|
814 |
+
"metadata": {
|
815 |
+
"id": "1ec3b20b-8393-4fda-a51d-cf67984cc166"
|
816 |
+
},
|
817 |
+
"source": [
|
818 |
+
"## Dataset Creation"
|
819 |
+
]
|
820 |
+
},
|
821 |
+
{
|
822 |
+
"cell_type": "code",
|
823 |
+
"execution_count": 25,
|
824 |
+
"id": "4b2e3951-6704-43d5-a69b-7587f26e6491",
|
825 |
+
"metadata": {
|
826 |
+
"tags": []
|
827 |
+
},
|
828 |
+
"outputs": [
|
829 |
+
{
|
830 |
+
"name": "stderr",
|
831 |
+
"output_type": "stream",
|
832 |
+
"text": [
|
833 |
+
"huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
|
834 |
+
"To disable this warning, you can either:\n",
|
835 |
+
"\t- Avoid using `tokenizers` before the fork if possible\n",
|
836 |
+
"\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
|
837 |
+
]
|
838 |
+
},
|
839 |
+
{
|
840 |
+
"data": {
|
841 |
+
"application/vnd.jupyter.widget-view+json": {
|
842 |
+
"model_id": "e983537088ae4305ada7aff87127eaa2",
|
843 |
+
"version_major": 2,
|
844 |
+
"version_minor": 0
|
845 |
+
},
|
846 |
+
"text/plain": [
|
847 |
+
"Map: 0%| | 0/24 [00:00<?, ? examples/s]"
|
848 |
+
]
|
849 |
+
},
|
850 |
+
"metadata": {},
|
851 |
+
"output_type": "display_data"
|
852 |
+
},
|
853 |
+
{
|
854 |
+
"data": {
|
855 |
+
"application/vnd.jupyter.widget-view+json": {
|
856 |
+
"model_id": "0595b2ee75a44cb48f4fef0fbcb75752",
|
857 |
+
"version_major": 2,
|
858 |
+
"version_minor": 0
|
859 |
+
},
|
860 |
+
"text/plain": [
|
861 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
862 |
+
]
|
863 |
+
},
|
864 |
+
"metadata": {},
|
865 |
+
"output_type": "display_data"
|
866 |
+
},
|
867 |
+
{
|
868 |
+
"data": {
|
869 |
+
"application/vnd.jupyter.widget-view+json": {
|
870 |
+
"model_id": "2e42a9acaf544f6a9dbd7cd3fcb3d381",
|
871 |
+
"version_major": 2,
|
872 |
+
"version_minor": 0
|
873 |
+
},
|
874 |
+
"text/plain": [
|
875 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
876 |
+
]
|
877 |
+
},
|
878 |
+
"metadata": {},
|
879 |
+
"output_type": "display_data"
|
880 |
+
},
|
881 |
+
{
|
882 |
+
"data": {
|
883 |
+
"application/vnd.jupyter.widget-view+json": {
|
884 |
+
"model_id": "6252b1b7f45a4b779ac77748a997256f",
|
885 |
+
"version_major": 2,
|
886 |
+
"version_minor": 0
|
887 |
+
},
|
888 |
+
"text/plain": [
|
889 |
+
"Downloading data: 0%| | 0.00/5.38k [00:00<?, ?B/s]"
|
890 |
+
]
|
891 |
+
},
|
892 |
+
"metadata": {},
|
893 |
+
"output_type": "display_data"
|
894 |
+
},
|
895 |
+
{
|
896 |
+
"data": {
|
897 |
+
"application/vnd.jupyter.widget-view+json": {
|
898 |
+
"model_id": "10d2c9d09da54646a6ea06ff09622ad0",
|
899 |
+
"version_major": 2,
|
900 |
+
"version_minor": 0
|
901 |
+
},
|
902 |
+
"text/plain": [
|
903 |
+
"Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
|
904 |
+
]
|
905 |
+
},
|
906 |
+
"metadata": {},
|
907 |
+
"output_type": "display_data"
|
908 |
+
},
|
909 |
+
{
|
910 |
+
"data": {
|
911 |
+
"application/vnd.jupyter.widget-view+json": {
|
912 |
+
"model_id": "ee28014824554610903cb642534c6cfe",
|
913 |
+
"version_major": 2,
|
914 |
+
"version_minor": 0
|
915 |
+
},
|
916 |
+
"text/plain": [
|
917 |
+
"Map: 0%| | 0/25 [00:00<?, ? examples/s]"
|
918 |
+
]
|
919 |
+
},
|
920 |
+
"metadata": {},
|
921 |
+
"output_type": "display_data"
|
922 |
+
},
|
923 |
+
{
|
924 |
+
"data": {
|
925 |
+
"application/vnd.jupyter.widget-view+json": {
|
926 |
+
"model_id": "c26ea784ed514f6382898cdd82e629c0",
|
927 |
+
"version_major": 2,
|
928 |
+
"version_minor": 0
|
929 |
+
},
|
930 |
+
"text/plain": [
|
931 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
932 |
+
]
|
933 |
+
},
|
934 |
+
"metadata": {},
|
935 |
+
"output_type": "display_data"
|
936 |
+
},
|
937 |
+
{
|
938 |
+
"data": {
|
939 |
+
"application/vnd.jupyter.widget-view+json": {
|
940 |
+
"model_id": "6470a8eba6cc45e1b266f160e07ac5f9",
|
941 |
+
"version_major": 2,
|
942 |
+
"version_minor": 0
|
943 |
+
},
|
944 |
+
"text/plain": [
|
945 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
946 |
+
]
|
947 |
+
},
|
948 |
+
"metadata": {},
|
949 |
+
"output_type": "display_data"
|
950 |
+
},
|
951 |
+
{
|
952 |
+
"data": {
|
953 |
+
"application/vnd.jupyter.widget-view+json": {
|
954 |
+
"model_id": "3e4d136bda254528993b54d6c3b5395a",
|
955 |
+
"version_major": 2,
|
956 |
+
"version_minor": 0
|
957 |
+
},
|
958 |
+
"text/plain": [
|
959 |
+
"README.md: 0%| | 0.00/599 [00:00<?, ?B/s]"
|
960 |
+
]
|
961 |
+
},
|
962 |
+
"metadata": {},
|
963 |
+
"output_type": "display_data"
|
964 |
+
},
|
965 |
+
{
|
966 |
+
"data": {
|
967 |
+
"application/vnd.jupyter.widget-view+json": {
|
968 |
+
"model_id": "23978739fcbf40ffb24f78c4e99088bc",
|
969 |
+
"version_major": 2,
|
970 |
+
"version_minor": 0
|
971 |
+
},
|
972 |
+
"text/plain": [
|
973 |
+
"Downloading data: 0%| | 0.00/13.8k [00:00<?, ?B/s]"
|
974 |
+
]
|
975 |
+
},
|
976 |
+
"metadata": {},
|
977 |
+
"output_type": "display_data"
|
978 |
+
},
|
979 |
+
{
|
980 |
+
"data": {
|
981 |
+
"application/vnd.jupyter.widget-view+json": {
|
982 |
+
"model_id": "d4bcf66919fb4444b5c2e4f489a57dc2",
|
983 |
+
"version_major": 2,
|
984 |
+
"version_minor": 0
|
985 |
+
},
|
986 |
+
"text/plain": [
|
987 |
+
"Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
|
988 |
+
]
|
989 |
+
},
|
990 |
+
"metadata": {},
|
991 |
+
"output_type": "display_data"
|
992 |
+
},
|
993 |
+
{
|
994 |
+
"data": {
|
995 |
+
"application/vnd.jupyter.widget-view+json": {
|
996 |
+
"model_id": "a3f4915f12a047daabdcbf50eb133131",
|
997 |
+
"version_major": 2,
|
998 |
+
"version_minor": 0
|
999 |
+
},
|
1000 |
+
"text/plain": [
|
1001 |
+
"Map: 0%| | 0/25 [00:00<?, ? examples/s]"
|
1002 |
+
]
|
1003 |
+
},
|
1004 |
+
"metadata": {},
|
1005 |
+
"output_type": "display_data"
|
1006 |
+
},
|
1007 |
+
{
|
1008 |
+
"data": {
|
1009 |
+
"application/vnd.jupyter.widget-view+json": {
|
1010 |
+
"model_id": "4c628855a75447fe812638f60d81e259",
|
1011 |
+
"version_major": 2,
|
1012 |
+
"version_minor": 0
|
1013 |
+
},
|
1014 |
+
"text/plain": [
|
1015 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
1016 |
+
]
|
1017 |
+
},
|
1018 |
+
"metadata": {},
|
1019 |
+
"output_type": "display_data"
|
1020 |
+
},
|
1021 |
+
{
|
1022 |
+
"data": {
|
1023 |
+
"application/vnd.jupyter.widget-view+json": {
|
1024 |
+
"model_id": "3ec49e43954d43e4bab6581afb5c3b95",
|
1025 |
+
"version_major": 2,
|
1026 |
+
"version_minor": 0
|
1027 |
+
},
|
1028 |
+
"text/plain": [
|
1029 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
1030 |
+
]
|
1031 |
+
},
|
1032 |
+
"metadata": {},
|
1033 |
+
"output_type": "display_data"
|
1034 |
+
},
|
1035 |
+
{
|
1036 |
+
"data": {
|
1037 |
+
"application/vnd.jupyter.widget-view+json": {
|
1038 |
+
"model_id": "be12c10f2f4846a49ad71670bfbf7df3",
|
1039 |
+
"version_major": 2,
|
1040 |
+
"version_minor": 0
|
1041 |
+
},
|
1042 |
+
"text/plain": [
|
1043 |
+
"README.md: 0%| | 0.00/1.17k [00:00<?, ?B/s]"
|
1044 |
+
]
|
1045 |
+
},
|
1046 |
+
"metadata": {},
|
1047 |
+
"output_type": "display_data"
|
1048 |
+
},
|
1049 |
+
{
|
1050 |
+
"data": {
|
1051 |
+
"application/vnd.jupyter.widget-view+json": {
|
1052 |
+
"model_id": "85b48f1ca1414aed9d24c09b7ee64ae8",
|
1053 |
+
"version_major": 2,
|
1054 |
+
"version_minor": 0
|
1055 |
+
},
|
1056 |
+
"text/plain": [
|
1057 |
+
"Downloading data: 0%| | 0.00/5.74k [00:00<?, ?B/s]"
|
1058 |
+
]
|
1059 |
+
},
|
1060 |
+
"metadata": {},
|
1061 |
+
"output_type": "display_data"
|
1062 |
+
},
|
1063 |
+
{
|
1064 |
+
"data": {
|
1065 |
+
"application/vnd.jupyter.widget-view+json": {
|
1066 |
+
"model_id": "66f25ed30a7a47eba1f62c59a5ef6b1a",
|
1067 |
+
"version_major": 2,
|
1068 |
+
"version_minor": 0
|
1069 |
+
},
|
1070 |
+
"text/plain": [
|
1071 |
+
"Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
|
1072 |
+
]
|
1073 |
+
},
|
1074 |
+
"metadata": {},
|
1075 |
+
"output_type": "display_data"
|
1076 |
+
},
|
1077 |
+
{
|
1078 |
+
"data": {
|
1079 |
+
"application/vnd.jupyter.widget-view+json": {
|
1080 |
+
"model_id": "0b1856a42f844ab286c628a5ab48ba6c",
|
1081 |
+
"version_major": 2,
|
1082 |
+
"version_minor": 0
|
1083 |
+
},
|
1084 |
+
"text/plain": [
|
1085 |
+
"Map: 0%| | 0/25 [00:00<?, ? examples/s]"
|
1086 |
+
]
|
1087 |
+
},
|
1088 |
+
"metadata": {},
|
1089 |
+
"output_type": "display_data"
|
1090 |
+
},
|
1091 |
+
{
|
1092 |
+
"data": {
|
1093 |
+
"application/vnd.jupyter.widget-view+json": {
|
1094 |
+
"model_id": "06b2892ed4da4846baee7b178c42fa40",
|
1095 |
+
"version_major": 2,
|
1096 |
+
"version_minor": 0
|
1097 |
+
},
|
1098 |
+
"text/plain": [
|
1099 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
1100 |
+
]
|
1101 |
+
},
|
1102 |
+
"metadata": {},
|
1103 |
+
"output_type": "display_data"
|
1104 |
+
},
|
1105 |
+
{
|
1106 |
+
"data": {
|
1107 |
+
"application/vnd.jupyter.widget-view+json": {
|
1108 |
+
"model_id": "92fb9cf2d4bf4b2abf9412d9238bc769",
|
1109 |
+
"version_major": 2,
|
1110 |
+
"version_minor": 0
|
1111 |
+
},
|
1112 |
+
"text/plain": [
|
1113 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
1114 |
+
]
|
1115 |
+
},
|
1116 |
+
"metadata": {},
|
1117 |
+
"output_type": "display_data"
|
1118 |
+
},
|
1119 |
+
{
|
1120 |
+
"data": {
|
1121 |
+
"application/vnd.jupyter.widget-view+json": {
|
1122 |
+
"model_id": "36914c848e654dee81683a379721b8c9",
|
1123 |
+
"version_major": 2,
|
1124 |
+
"version_minor": 0
|
1125 |
+
},
|
1126 |
+
"text/plain": [
|
1127 |
+
"README.md: 0%| | 0.00/1.74k [00:00<?, ?B/s]"
|
1128 |
+
]
|
1129 |
+
},
|
1130 |
+
"metadata": {},
|
1131 |
+
"output_type": "display_data"
|
1132 |
+
},
|
1133 |
+
{
|
1134 |
+
"data": {
|
1135 |
+
"application/vnd.jupyter.widget-view+json": {
|
1136 |
+
"model_id": "6b23b1c80d1349c68fea02d8bb4baee1",
|
1137 |
+
"version_major": 2,
|
1138 |
+
"version_minor": 0
|
1139 |
+
},
|
1140 |
+
"text/plain": [
|
1141 |
+
"Downloading data: 0%| | 0.00/10.9k [00:00<?, ?B/s]"
|
1142 |
+
]
|
1143 |
+
},
|
1144 |
+
"metadata": {},
|
1145 |
+
"output_type": "display_data"
|
1146 |
+
},
|
1147 |
+
{
|
1148 |
+
"data": {
|
1149 |
+
"application/vnd.jupyter.widget-view+json": {
|
1150 |
+
"model_id": "0b91c12b7b6c4d88a32504e52be379b3",
|
1151 |
+
"version_major": 2,
|
1152 |
+
"version_minor": 0
|
1153 |
+
},
|
1154 |
+
"text/plain": [
|
1155 |
+
"Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
|
1156 |
+
]
|
1157 |
+
},
|
1158 |
+
"metadata": {},
|
1159 |
+
"output_type": "display_data"
|
1160 |
+
},
|
1161 |
+
{
|
1162 |
+
"data": {
|
1163 |
+
"application/vnd.jupyter.widget-view+json": {
|
1164 |
+
"model_id": "188ee5373fbc4b2c845ae823b073daca",
|
1165 |
+
"version_major": 2,
|
1166 |
+
"version_minor": 0
|
1167 |
+
},
|
1168 |
+
"text/plain": [
|
1169 |
+
"Map: 0%| | 0/25 [00:00<?, ? examples/s]"
|
1170 |
+
]
|
1171 |
+
},
|
1172 |
+
"metadata": {},
|
1173 |
+
"output_type": "display_data"
|
1174 |
+
},
|
1175 |
+
{
|
1176 |
+
"data": {
|
1177 |
+
"application/vnd.jupyter.widget-view+json": {
|
1178 |
+
"model_id": "91b7e4137f50487d927dabc859d9f19f",
|
1179 |
+
"version_major": 2,
|
1180 |
+
"version_minor": 0
|
1181 |
+
},
|
1182 |
+
"text/plain": [
|
1183 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
1184 |
+
]
|
1185 |
+
},
|
1186 |
+
"metadata": {},
|
1187 |
+
"output_type": "display_data"
|
1188 |
+
},
|
1189 |
+
{
|
1190 |
+
"data": {
|
1191 |
+
"application/vnd.jupyter.widget-view+json": {
|
1192 |
+
"model_id": "d2b2d76b482f4fe0a7dcf1ff4cfc1168",
|
1193 |
+
"version_major": 2,
|
1194 |
+
"version_minor": 0
|
1195 |
+
},
|
1196 |
+
"text/plain": [
|
1197 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
1198 |
+
]
|
1199 |
+
},
|
1200 |
+
"metadata": {},
|
1201 |
+
"output_type": "display_data"
|
1202 |
+
},
|
1203 |
+
{
|
1204 |
+
"data": {
|
1205 |
+
"application/vnd.jupyter.widget-view+json": {
|
1206 |
+
"model_id": "4fff6b9680244b85a572cf6373781cec",
|
1207 |
+
"version_major": 2,
|
1208 |
+
"version_minor": 0
|
1209 |
+
},
|
1210 |
+
"text/plain": [
|
1211 |
+
"README.md: 0%| | 0.00/2.34k [00:00<?, ?B/s]"
|
1212 |
+
]
|
1213 |
+
},
|
1214 |
+
"metadata": {},
|
1215 |
+
"output_type": "display_data"
|
1216 |
+
},
|
1217 |
+
{
|
1218 |
+
"data": {
|
1219 |
+
"application/vnd.jupyter.widget-view+json": {
|
1220 |
+
"model_id": "21baa00c99e2485d8af5edaf095dfc46",
|
1221 |
+
"version_major": 2,
|
1222 |
+
"version_minor": 0
|
1223 |
+
},
|
1224 |
+
"text/plain": [
|
1225 |
+
"Downloading data: 0%| | 0.00/16.5k [00:00<?, ?B/s]"
|
1226 |
+
]
|
1227 |
+
},
|
1228 |
+
"metadata": {},
|
1229 |
+
"output_type": "display_data"
|
1230 |
+
},
|
1231 |
+
{
|
1232 |
+
"data": {
|
1233 |
+
"application/vnd.jupyter.widget-view+json": {
|
1234 |
+
"model_id": "610e87cdb9a44e91b8eb1c0dbe4af60f",
|
1235 |
+
"version_major": 2,
|
1236 |
+
"version_minor": 0
|
1237 |
+
},
|
1238 |
+
"text/plain": [
|
1239 |
+
"Generating train split: 0%| | 0/24 [00:00<?, ? examples/s]"
|
1240 |
+
]
|
1241 |
+
},
|
1242 |
+
"metadata": {},
|
1243 |
+
"output_type": "display_data"
|
1244 |
+
},
|
1245 |
+
{
|
1246 |
+
"data": {
|
1247 |
+
"application/vnd.jupyter.widget-view+json": {
|
1248 |
+
"model_id": "d7fe56bef4fd490fa18ad574cc72ba77",
|
1249 |
+
"version_major": 2,
|
1250 |
+
"version_minor": 0
|
1251 |
+
},
|
1252 |
+
"text/plain": [
|
1253 |
+
"Map: 0%| | 0/24 [00:00<?, ? examples/s]"
|
1254 |
+
]
|
1255 |
+
},
|
1256 |
+
"metadata": {},
|
1257 |
+
"output_type": "display_data"
|
1258 |
+
},
|
1259 |
+
{
|
1260 |
+
"data": {
|
1261 |
+
"application/vnd.jupyter.widget-view+json": {
|
1262 |
+
"model_id": "14d8c2e7c9144f62ab6d5ba3dae7e96f",
|
1263 |
+
"version_major": 2,
|
1264 |
+
"version_minor": 0
|
1265 |
+
},
|
1266 |
+
"text/plain": [
|
1267 |
+
"Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
|
1268 |
+
]
|
1269 |
+
},
|
1270 |
+
"metadata": {},
|
1271 |
+
"output_type": "display_data"
|
1272 |
+
},
|
1273 |
+
{
|
1274 |
+
"data": {
|
1275 |
+
"application/vnd.jupyter.widget-view+json": {
|
1276 |
+
"model_id": "eb21d60a8bf940e797a432469b74d262",
|
1277 |
+
"version_major": 2,
|
1278 |
+
"version_minor": 0
|
1279 |
+
},
|
1280 |
+
"text/plain": [
|
1281 |
+
"Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
|
1282 |
+
]
|
1283 |
+
},
|
1284 |
+
"metadata": {},
|
1285 |
+
"output_type": "display_data"
|
1286 |
+
},
|
1287 |
+
{
|
1288 |
+
"data": {
|
1289 |
+
"application/vnd.jupyter.widget-view+json": {
|
1290 |
+
"model_id": "0cc785b4868143f99ffe04a0f9239692",
|
1291 |
+
"version_major": 2,
|
1292 |
+
"version_minor": 0
|
1293 |
+
},
|
1294 |
+
"text/plain": [
|
1295 |
+
"README.md: 0%| | 0.00/2.91k [00:00<?, ?B/s]"
|
1296 |
+
]
|
1297 |
+
},
|
1298 |
+
"metadata": {},
|
1299 |
+
"output_type": "display_data"
|
1300 |
+
}
|
1301 |
+
],
|
1302 |
+
"source": [
|
1303 |
+
"from datasets import load_dataset\n",
|
1304 |
+
"\n",
|
1305 |
+
"subsets = ['ELRC-24ss', 'GNOME-25ss', 'HPLT-25ss', 'OpenSubtitles-25ss', 'TED2020-25ss', 'UNPC-24ss']\n",
|
1306 |
+
"\n",
|
1307 |
+
"# Iterate over each subset\n",
|
1308 |
+
"for subset in subsets:\n",
|
1309 |
+
" # Load the dataset for the specific subset\n",
|
1310 |
+
" dataset = load_dataset(\"arabic-translation-prompt-engineering/TpDwD\", subset)\n",
|
1311 |
+
"\n",
|
1312 |
+
" # Rename the columns\n",
|
1313 |
+
" dataset = dataset.rename_column(\"ar_text\", \"human_translation\")\n",
|
1314 |
+
" dataset = dataset.rename_column(\"en_text\", \"source_text\")\n",
|
1315 |
+
"\n",
|
1316 |
+
" # Apply functions to add new columns\n",
|
1317 |
+
" dataset = dataset.map(lambda example: {\n",
|
1318 |
+
" \"baseline_translation\": baseline_chat_completion(example['source_text']).choices[0].message.content,\n",
|
1319 |
+
" \"purpose_driven_translation\": purpose_driven_chat_completion(example['source_text'], subset).choices[0].message.tool_calls[0].function.arguments['translation'],\n",
|
1320 |
+
" \"automatic_purpose_driven_translation\": automatic_purpose_driven_chat(example['source_text']).choices[0].message.content\n",
|
1321 |
+
" })\n",
|
1322 |
+
" \n",
|
1323 |
+
" # Push the processed dataset to the Hub\n",
|
1324 |
+
" dataset.push_to_hub(f\"arabic-translation-prompt-engineering/TpDwD_translated\",subset)\n"
|
1325 |
+
]
|
1326 |
+
},
|
1327 |
+
{
|
1328 |
+
"cell_type": "markdown",
|
1329 |
+
"id": "1ec3b20b-8393-4fda-a51d-cf67984cc166",
|
1330 |
+
"metadata": {
|
1331 |
+
"id": "1ec3b20b-8393-4fda-a51d-cf67984cc166"
|
1332 |
+
},
|
1333 |
+
"source": [
|
1334 |
+
"# Push to the hub"
|
1335 |
+
]
|
1336 |
+
},
|
1337 |
+
{
|
1338 |
+
"cell_type": "code",
|
1339 |
+
"execution_count": null,
|
1340 |
+
"id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
|
1341 |
+
"metadata": {
|
1342 |
+
"id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
|
1343 |
+
"outputId": "d3b9b994-e3af-4f05-88b4-3b29778d1dc7",
|
1344 |
+
"tags": []
|
1345 |
+
},
|
1346 |
+
"outputs": [],
|
1347 |
"source": [
|
1348 |
"from huggingface_hub import HfApi\n",
|
1349 |
"\n",
|