derek-thomas HF staff committed on
Commit
66148ed
1 Parent(s): cfb4a98

Adding dataset creation

Browse files
Files changed (1) hide show
  1. translate-prompts.ipynb +730 -122
translate-prompts.ipynb CHANGED
@@ -13,6 +13,18 @@
13
  {
14
  "cell_type": "code",
15
  "execution_count": null,
 
 
 
 
 
 
 
 
 
 
 
 
16
  "id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
17
  "metadata": {
18
  "id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
@@ -20,12 +32,13 @@
20
  },
21
  "outputs": [],
22
  "source": [
23
- "from huggingface_hub import InferenceClient, login"
 
24
  ]
25
  },
26
  {
27
  "cell_type": "code",
28
- "execution_count": null,
29
  "id": "dc9f0411-8bf2-4a20-a6ea-331a2a486b8e",
30
  "metadata": {
31
  "colab": {
@@ -41,7 +54,7 @@
41
  {
42
  "data": {
43
  "application/vnd.jupyter.widget-view+json": {
44
- "model_id": "515c96c357454fdc9a38ecc995ff1b3d",
45
  "version_major": 2,
46
  "version_minor": 0
47
  },
@@ -76,14 +89,16 @@
76
  },
77
  {
78
  "cell_type": "code",
79
- "execution_count": null,
80
  "id": "84e6cb89-30d3-4ef5-8063-07783798e045",
81
  "metadata": {
82
- "id": "84e6cb89-30d3-4ef5-8063-07783798e045"
 
83
  },
84
  "outputs": [],
85
  "source": [
86
  "MODEL = \"CohereForAI/c4ai-command-r-plus\"\n",
 
87
  "client = InferenceClient(MODEL)"
88
  ]
89
  },
@@ -97,7 +112,13 @@
97
  "# Translation\n",
98
  "Our goal is to explore translation between English and Arabic and how prompt engineering can impact it. There has been [some work](https://arxiv.org/pdf/2308.01391), but we didn't find as much as we were hoping, especially for open source models.\n",
99
  "\n",
100
- "We have created a dataset across 6 domains and want to compare each method by having human rankers. We also have human translations to ground these rankings."
 
 
 
 
 
 
101
  ]
102
  },
103
  {
@@ -127,7 +148,7 @@
127
  },
128
  {
129
  "cell_type": "code",
130
- "execution_count": null,
131
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
132
  "metadata": {
133
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
@@ -135,7 +156,7 @@
135
  },
136
  "outputs": [],
137
  "source": [
138
- "system_prompt = \"\"\"You are a skilled translator with extensive experience in English and Arabic translations. You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text. Your role is crucial in ensuring clear and precise communication in our multilingual system.\"\"\""
139
  ]
140
  },
141
  {
@@ -146,7 +167,7 @@
146
  },
147
  "source": [
148
  "### Instruction\n",
149
- "> Translate this from arabic to english: {translation_input}.\n",
150
  ">\n",
151
  "> Translation:\n",
152
  "\n",
@@ -155,19 +176,20 @@
155
  },
156
  {
157
  "cell_type": "code",
158
- "execution_count": null,
159
  "id": "b7f1722c-c484-4e22-a025-53f95943fc76",
160
  "metadata": {
161
- "id": "b7f1722c-c484-4e22-a025-53f95943fc76"
 
162
  },
163
  "outputs": [],
164
  "source": [
165
- "def baseline_chat_completion(system_prompt, translation_input):\n",
166
  " \"\"\"\n",
167
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
168
  " \"\"\"\n",
169
  " messages = [\n",
170
- " {\"role\": \"system\", \"content\": system_prompt},\n",
171
  " {\n",
172
  " \"role\": \"user\",\n",
173
  " \"content\": f\"Translate this from english to arabic: {translation_input}.\\nTranslation: \",\n",
@@ -178,7 +200,7 @@
178
  },
179
  {
180
  "cell_type": "code",
181
- "execution_count": null,
182
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
183
  "metadata": {
184
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
@@ -188,7 +210,6 @@
188
  "source": [
189
  "translation_input = \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\"\n",
190
  "response = baseline_chat_completion(\n",
191
- " system_prompt,\n",
192
  " translation_input,\n",
193
  ")"
194
  ]
@@ -201,23 +222,21 @@
201
  },
202
  "source": [
203
  "### Token Cost\n",
204
- "Here we can see that the cost is quite cheap, only 92 tokens!"
205
  ]
206
  },
207
  {
208
  "cell_type": "code",
209
- "execution_count": null,
210
- "id": "4e305b1e-56e0-44da-8c17-496cbcc35fad",
211
  "metadata": {
212
- "id": "4e305b1e-56e0-44da-8c17-496cbcc35fad",
213
- "outputId": "a97a8a75-e2e6-4ac3-e7ac-db674a1f46c9",
214
  "tags": []
215
  },
216
  "outputs": [
217
  {
218
  "data": {
219
  "text/plain": [
220
- "120"
221
  ]
222
  },
223
  "execution_count": 7,
@@ -226,12 +245,12 @@
226
  }
227
  ],
228
  "source": [
229
- "response.usage.prompt_tokens"
230
  ]
231
  },
232
  {
233
  "cell_type": "code",
234
- "execution_count": null,
235
  "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
236
  "metadata": {
237
  "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
@@ -243,7 +262,7 @@
243
  "name": "stdout",
244
  "output_type": "stream",
245
  "text": [
246
- "يطير كالفراشة ويلسع كالنحلة - لا يمكن ليديه أن تصيبا ما لا تستطيع عيناه رؤيته.\n"
247
  ]
248
  }
249
  ],
@@ -278,7 +297,7 @@
278
  },
279
  {
280
  "cell_type": "code",
281
- "execution_count": null,
282
  "id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
283
  "metadata": {
284
  "id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
@@ -316,37 +335,67 @@
316
  },
317
  {
318
  "cell_type": "code",
319
- "execution_count": null,
320
- "id": "f865e9f8-7c63-4e72-b539-0d5916eda44f",
321
  "metadata": {
322
- "id": "f865e9f8-7c63-4e72-b539-0d5916eda44f"
323
  },
324
  "outputs": [],
325
  "source": [
326
- "def purpose_driven_chat_completion(system_prompt, translation_input, dataset):\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
327
  " \"\"\"\n",
328
- " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
 
329
  " \"\"\"\n",
 
 
 
 
 
 
 
 
 
330
  "\n",
331
- " prompt = f\"\"\"Translate the following English [source text] into Arabic. Please fulfill the following conditions when translating.\n",
332
- "Purpose of the translation: {dataset_to_purpose_target[dataset]['purpose']}\n",
333
- "Target audience: {dataset_to_purpose_target[dataset]['audience']}\n",
334
- "[source text] `{translation_input}`\n",
335
- "[translated text] \"\"\"\n",
336
- "\n",
337
  " messages = [\n",
338
- " {\"role\": \"system\", \"content\": system_prompt},\n",
339
  " {\n",
340
  " \"role\": \"user\",\n",
341
  " \"content\": prompt,\n",
342
  " },\n",
343
  " ]\n",
344
- " return client.chat_completion(messages, max_tokens=10_000)"
 
 
 
345
  ]
346
  },
347
  {
348
  "cell_type": "code",
349
- "execution_count": null,
350
  "id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
351
  "metadata": {
352
  "id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
@@ -355,23 +404,21 @@
355
  "outputs": [],
356
  "source": [
357
  "translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
358
- "response = purpose_driven_chat_completion(system_prompt, translation_input, \"ELRC-24ss\")"
359
  ]
360
  },
361
  {
362
  "cell_type": "code",
363
- "execution_count": null,
364
- "id": "6296d255-d11d-4df7-aa0e-1226ef3d963a",
365
  "metadata": {
366
- "id": "6296d255-d11d-4df7-aa0e-1226ef3d963a",
367
- "outputId": "45282f39-18ef-4dc0-f708-1113ca1769ce",
368
  "tags": []
369
  },
370
  "outputs": [
371
  {
372
  "data": {
373
  "text/plain": [
374
- "208"
375
  ]
376
  },
377
  "execution_count": 12,
@@ -380,12 +427,12 @@
380
  }
381
  ],
382
  "source": [
383
- "response.usage.prompt_tokens"
384
  ]
385
  },
386
  {
387
  "cell_type": "code",
388
- "execution_count": null,
389
  "id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
390
  "metadata": {
391
  "id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
@@ -397,12 +444,18 @@
397
  "name": "stdout",
398
  "output_type": "stream",
399
  "text": [
400
- "\"لقد لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد ... الرؤى، فإن هذا يؤدي إلى نقاش حول ما إذا كان ينبغي التركيز على صحة النظام البيئي أو رفاهية الإنسان ... إن معرفة ما إذا كانت الأولوية للنظم البيئية أو للبشر تؤثر بشكل كبير على تقييم أصحاب المصلحة للحالات البيئية والاجتماعية المرغوبة.\"\n"
 
 
 
 
401
  ]
402
  }
403
  ],
404
  "source": [
405
- "print(response.choices[0].message.content)"
 
 
406
  ]
407
  },
408
  {
@@ -434,13 +487,15 @@
434
  "id": "2f5deb21-18fb-4c5b-9045-c7fe5e751c05"
435
  },
436
  "source": [
437
- "Its usually helpful if we tell the LLM what we want to create when we prompt it. I have found using the tags like `<source text>` can be really useful to denote what you are specifying.\n",
438
  "\n",
439
- "> I want to translate the following \\<source text\\> from English into Arabic. But first I want to create a json that includes the following: \n",
440
- "{\"intent\": \"\", \"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}. \n",
441
- "Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet! \n",
442
- "\\<source text\\> \n",
443
- "\\</source text\\> "
 
 
444
  ]
445
  },
446
  {
@@ -455,7 +510,7 @@
455
  },
456
  {
457
  "cell_type": "code",
458
- "execution_count": null,
459
  "id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
460
  "metadata": {
461
  "id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
@@ -463,7 +518,7 @@
463
  },
464
  "outputs": [],
465
  "source": [
466
- "translation_tools = [\n",
467
  " {\n",
468
  " \"type\": \"function\",\n",
469
  " \"function\": {\n",
@@ -503,38 +558,39 @@
503
  },
504
  {
505
  "cell_type": "code",
506
- "execution_count": null,
507
  "id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
508
  "metadata": {
509
- "id": "b6436ce6-03af-4206-a283-0c2ecd17bd88"
 
510
  },
511
  "outputs": [],
512
  "source": [
513
- "def tool_call_chat_completion(system_prompt, translation_input, tools):\n",
514
  " \"\"\"\n",
515
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
516
  " \"\"\"\n",
517
  "\n",
518
- " prompt = f\"\"\"I want to translate the following <source text> from English into Arabic. But first I want to create a json that includes the following:\n",
519
  "{{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}}.\n",
520
  "Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
521
- "<source text>\n",
522
- "{translation_input}\n",
523
- "</source text>\n",
524
  "\"\"\"\n",
525
  " messages = [\n",
526
- " {\"role\": \"system\", \"content\": system_prompt},\n",
527
  " {\n",
528
  " \"role\": \"user\",\n",
529
  " \"content\": prompt,\n",
530
  " },\n",
531
  " ]\n",
532
- " return client.chat_completion(messages, max_tokens=10_000, tools=tools, tool_choice='get_translation_audience_purpose')"
533
  ]
534
  },
535
  {
536
  "cell_type": "code",
537
- "execution_count": null,
538
  "id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
539
  "metadata": {
540
  "id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
@@ -543,13 +599,13 @@
543
  "outputs": [],
544
  "source": [
545
  "translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
546
- "response = tool_call_chat_completion(system_prompt, translation_input, translation_tools)"
547
  ]
548
  },
549
  {
550
  "cell_type": "code",
551
- "execution_count": null,
552
- "id": "07e0f133-f6e1-4bc6-969c-da9d36bfba2f",
553
  "metadata": {
554
  "id": "07e0f133-f6e1-4bc6-969c-da9d36bfba2f",
555
  "outputId": "82e2506a-7336-4909-b8e4-fc52f671c511",
@@ -559,7 +615,7 @@
559
  {
560
  "data": {
561
  "text/plain": [
562
- "454"
563
  ]
564
  },
565
  "execution_count": 17,
@@ -568,13 +624,12 @@
568
  }
569
  ],
570
  "source": [
571
- "function_call_tokens = response.usage.prompt_tokens\n",
572
- "function_call_tokens"
573
  ]
574
  },
575
  {
576
  "cell_type": "code",
577
- "execution_count": null,
578
  "id": "02be1827-7137-463f-a026-0b26dec6f552",
579
  "metadata": {
580
  "id": "02be1827-7137-463f-a026-0b26dec6f552",
@@ -586,22 +641,21 @@
586
  "name": "stdout",
587
  "output_type": "stream",
588
  "text": [
589
- "{'assumptions relating to the content': 'The text assumes that the readers '\n",
590
- " 'understand the concept of stakeholder '\n",
591
- " 'groups and their potential differing '\n",
592
- " 'priorities regarding environmental '\n",
593
- " 'and social issues.',\n",
594
- " 'audience': 'The target audience for this text is likely individuals or '\n",
595
- " 'organizations involved in sustainability, ecology, or social '\n",
596
- " 'welfare fields, as well as stakeholders with an interest in '\n",
597
- " 'these areas.',\n",
598
- " 'purpose': 'The purpose of this text is to highlight the observations made '\n",
599
- " 'about the debates that arise when stakeholder groups define their '\n",
600
- " 'visions, specifically regarding the priority between ecosystem '\n",
601
- " 'health and human well-being, and how this priority influences '\n",
602
- " 'their assessment of desired ecological and social outcomes.',\n",
603
- " 'subject': 'Debates on Prioritizing Ecosystem vs. Human Well-being by '\n",
604
- " 'Stakeholder Groups'}\n"
605
  ]
606
  }
607
  ],
@@ -613,38 +667,36 @@
613
  },
614
  {
615
  "cell_type": "code",
616
- "execution_count": null,
617
  "id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
618
  "metadata": {
619
- "id": "7575bd09-2d20-49ae-bb10-162a0e469f16"
 
620
  },
621
  "outputs": [],
622
  "source": [
623
- "def automatic_purpose_driven_chat_completion(system_prompt, translation_input, description_json):\n",
624
  " \"\"\"\n",
625
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
626
  " \"\"\"\n",
627
  "\n",
628
- " prompt = f\"\"\"Given the following descriptive json translate <source text> from English to Arabic\n",
629
- "{description_json}\n",
630
- "<source text>\n",
631
- "{translation_input}\n",
632
- "</source text>\n",
633
  "Translation:\n",
634
  "\"\"\"\n",
635
  " messages = [\n",
636
- " {\"role\": \"system\", \"content\": system_prompt},\n",
637
- " {\n",
638
- " \"role\": \"user\",\n",
639
- " \"content\": prompt,\n",
640
- " },\n",
641
  " ]\n",
642
  " return client.chat_completion(messages, max_tokens=10_000)"
643
  ]
644
  },
645
  {
646
  "cell_type": "code",
647
- "execution_count": null,
648
  "id": "32bccca0-c866-4006-84d1-d5b783b73689",
649
  "metadata": {
650
  "id": "32bccca0-c866-4006-84d1-d5b783b73689",
@@ -652,12 +704,12 @@
652
  },
653
  "outputs": [],
654
  "source": [
655
- "response = automatic_purpose_driven_chat_completion(system_prompt, translation_input, description_json)"
656
  ]
657
  },
658
  {
659
  "cell_type": "code",
660
- "execution_count": null,
661
  "id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
662
  "metadata": {
663
  "id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
@@ -668,7 +720,7 @@
668
  {
669
  "data": {
670
  "text/plain": [
671
- "311"
672
  ]
673
  },
674
  "execution_count": 21,
@@ -677,13 +729,12 @@
677
  }
678
  ],
679
  "source": [
680
- "automatic_purpose_driven_tokens = response.usage.prompt_tokens\n",
681
- "automatic_purpose_driven_tokens"
682
  ]
683
  },
684
  {
685
  "cell_type": "code",
686
- "execution_count": null,
687
  "id": "462ca84c-9ffd-4924-880d-e06b724caf02",
688
  "metadata": {
689
  "id": "462ca84c-9ffd-4924-880d-e06b724caf02",
@@ -695,9 +746,15 @@
695
  "name": "stdout",
696
  "output_type": "stream",
697
  "text": [
698
- "Here is the translated text from English to Arabic:\n",
699
- "\n",
700
- "{\"افتراضات متعلقة بالمحتوى\": \"يفترض النص أن القراء يفهمون مفهوم مجموعات أصحاب المصلحة وأولوياتهم المختلفة المحتملة فيما يتعلق بالقضايا البيئية والاجتماعية.\", \"الجمهور\": \"الجمهور المستهدف لهذا النص هو على الأرجح أفراد أو منظمات تعمل في مجالات الاستدامة أو الإيكولوجيا أو الرعاية الاجتماعية، بالإضافة إلى أصحاب المصلحة المهتمين بهذه المجالات.\", \"الغرض\": \"الغرض من هذا النص هو تسليط الضوء على الملاحظات التي أبديت حول النقاشات التي تنشأ عندما تحدد مجموعات أصحاب المصلحة رؤيتها، خاصة فيما يتعلق بالأولوية بين صحة النظام البيئي ورفاهية الإنسان، وكيف تؤثر هذه الأولوية على تقييمها للنتائج الإيكولوجية والاجتماعية المرجوة.\", \"الموضوع\": \"النقاشات حول أولويات النظام البيئي مقابل رفاهية الإنسان من قبل مجموعات أصحاب المصلحة\": \"لقد لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد الرؤى، فإن ذلك يؤدي إلى نقاش حول ما إذا كان ينبغي التركيز على صحة النظام البيئي أو رفاهية الإنسان... إن أولوية النظم البيئية أو البشر تؤثر بشكل كبير على تقييم أصحاب المصلحة للوضع الاجتماعي والإيكولوجي المرغوب.\"}\n"
 
 
 
 
 
 
701
  ]
702
  }
703
  ],
@@ -707,35 +764,586 @@
707
  },
708
  {
709
  "cell_type": "markdown",
710
- "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166",
 
 
 
 
 
 
 
 
 
711
  "metadata": {
712
- "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166"
713
  },
 
714
  "source": [
715
- "# Push to the hub"
 
 
 
716
  ]
717
  },
718
  {
719
  "cell_type": "code",
720
- "execution_count": null,
721
- "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
722
  "metadata": {
723
- "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
724
- "outputId": "d3b9b994-e3af-4f05-88b4-3b29778d1dc7",
725
  "tags": []
726
  },
727
  "outputs": [
728
  {
729
  "data": {
730
  "text/plain": [
731
- "CommitInfo(commit_url='https://huggingface.co/arabic-translation-prompt-engineering/atpe-notebooks/commit/d112cc49a99a72106b75c3eae9be453264fb488d', commit_message='Upload baseline.ipynb with huggingface_hub', commit_description='', oid='d112cc49a99a72106b75c3eae9be453264fb488d', pr_url=None, pr_revision=None, pr_num=None)"
732
  ]
733
  },
734
- "execution_count": 23,
735
  "metadata": {},
736
  "output_type": "execute_result"
737
  }
738
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
739
  "source": [
740
  "from huggingface_hub import HfApi\n",
741
  "\n",
 
13
  {
14
  "cell_type": "code",
15
  "execution_count": null,
16
+ "id": "ddb3dfde-39cc-4ad9-917e-48413add2d9b",
17
+ "metadata": {
18
+ "tags": []
19
+ },
20
+ "outputs": [],
21
+ "source": [
22
+ "%pip install -U -q transformers huggingface-hub"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 1,
28
  "id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
29
  "metadata": {
30
  "id": "b45bd52f-03e9-419f-8110-1013ff45fb1b",
 
32
  },
33
  "outputs": [],
34
  "source": [
35
+ "from huggingface_hub import InferenceClient, login\n",
36
+ "from transformers import AutoTokenizer"
37
  ]
38
  },
39
  {
40
  "cell_type": "code",
41
+ "execution_count": 2,
42
  "id": "dc9f0411-8bf2-4a20-a6ea-331a2a486b8e",
43
  "metadata": {
44
  "colab": {
 
54
  {
55
  "data": {
56
  "application/vnd.jupyter.widget-view+json": {
57
+ "model_id": "da0ae7fafffb4005a5325a53896feb82",
58
  "version_major": 2,
59
  "version_minor": 0
60
  },
 
89
  },
90
  {
91
  "cell_type": "code",
92
+ "execution_count": 3,
93
  "id": "84e6cb89-30d3-4ef5-8063-07783798e045",
94
  "metadata": {
95
+ "id": "84e6cb89-30d3-4ef5-8063-07783798e045",
96
+ "tags": []
97
  },
98
  "outputs": [],
99
  "source": [
100
  "MODEL = \"CohereForAI/c4ai-command-r-plus\"\n",
101
+ "tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n",
102
  "client = InferenceClient(MODEL)"
103
  ]
104
  },
 
112
  "# Translation\n",
113
  "Our goal is to explore translation between English and Arabic and how prompt engineering can impact it. There has been [some work](https://arxiv.org/pdf/2308.01391), but we didn't find as much as we were hoping, especially for open source models.\n",
114
  "\n",
115
+ "We have created a dataset, [arabic-translation-prompt-engineering/TpDwD](https://huggingface.co/datasets/arabic-translation-prompt-engineering/TpDwD), spanning 6 domains, and we want to compare each method using human rankers. We also have human translations to ground these rankings.\n",
116
+ "\n",
117
+ "We will evaluate the following methods:\n",
118
+ "- Baseline\n",
119
+ "- Manual Purpose Driven\n",
120
+ "- Automatic Purpose Driven\n",
121
+ "- Automatic Motivation Driven"
122
  ]
123
  },
124
  {
 
148
  },
149
  {
150
  "cell_type": "code",
151
+ "execution_count": 4,
152
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
153
  "metadata": {
154
  "id": "032c86d2-868e-4fa6-b03e-58f1c41434cc",
 
156
  },
157
  "outputs": [],
158
  "source": [
159
+ "baseline_system_prompt = \"\"\"You are a skilled translator with extensive experience in English and Arabic translations. You possess a deep understanding of the linguistic, cultural, and contextual nuances essential for accurate and effective translation between these languages. Highly motivated and detail-oriented, you are committed to delivering translations that maintain the integrity and intent of the original text. Your role is crucial in ensuring clear and precise communication in our multilingual system.\"\"\""
160
  ]
161
  },
162
  {
 
167
  },
168
  "source": [
169
  "### Instruction\n",
170
+ "> Translate this from english to arabic: {translation_input}.\n",
171
  ">\n",
172
  "> Translation:\n",
173
  "\n",
 
176
  },
177
  {
178
  "cell_type": "code",
179
+ "execution_count": 5,
180
  "id": "b7f1722c-c484-4e22-a025-53f95943fc76",
181
  "metadata": {
182
+ "id": "b7f1722c-c484-4e22-a025-53f95943fc76",
183
+ "tags": []
184
  },
185
  "outputs": [],
186
  "source": [
187
+ "def baseline_chat_completion(translation_input):\n",
188
  " \"\"\"\n",
189
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
190
  " \"\"\"\n",
191
  " messages = [\n",
192
+ " {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
193
  " {\n",
194
  " \"role\": \"user\",\n",
195
  " \"content\": f\"Translate this from english to arabic: {translation_input}.\\nTranslation: \",\n",
 
200
  },
201
  {
202
  "cell_type": "code",
203
+ "execution_count": 6,
204
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
205
  "metadata": {
206
  "id": "96a0ba0b-be47-4eb0-bbc2-c82b0ea1b72e",
 
210
  "source": [
211
  "translation_input = \"Float like a butterfly sting like a bee – his hands can’t hit what his eyes can’t see.\"\n",
212
  "response = baseline_chat_completion(\n",
 
213
  " translation_input,\n",
214
  ")"
215
  ]
 
222
  },
223
  "source": [
224
  "### Token Cost\n",
225
+ "Here we can see that the prompt overhead is quite cheap: only 96 tokens on top of the text being translated!"
226
  ]
227
  },
228
  {
229
  "cell_type": "code",
230
+ "execution_count": 7,
231
+ "id": "2afc890f-5d8d-4df8-b19a-25888211cf18",
232
  "metadata": {
 
 
233
  "tags": []
234
  },
235
  "outputs": [
236
  {
237
  "data": {
238
  "text/plain": [
239
+ "'Baseline Total Prompt tokens: 96'"
240
  ]
241
  },
242
  "execution_count": 7,
 
245
  }
246
  ],
247
  "source": [
248
+ "f\"Baseline Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
249
  ]
250
  },
251
  {
252
  "cell_type": "code",
253
+ "execution_count": 8,
254
  "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
255
  "metadata": {
256
  "id": "ef24fe6b-d801-4f3e-95ad-cb7f67247bc3",
 
262
  "name": "stdout",
263
  "output_type": "stream",
264
  "text": [
265
+ "يسبح في الحلبة كالفراشة ويلسع كالنحلة - لا يمكن ليديه أن تصيبا ما لا تستطيع عيناه رؤيته\n"
266
  ]
267
  }
268
  ],
 
297
  },
298
  {
299
  "cell_type": "code",
300
+ "execution_count": 9,
301
  "id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
302
  "metadata": {
303
  "id": "4714f6a2-fd0b-48ee-80bc-860f40ee2baa",
 
335
  },
336
  {
337
  "cell_type": "code",
338
+ "execution_count": 10,
339
+ "id": "d0c3418e-0b87-458f-8517-1ca3e59ab57a",
340
  "metadata": {
341
+ "tags": []
342
  },
343
  "outputs": [],
344
  "source": [
345
+ "# Define the translation tool function\n",
346
+ "purpose_driven_translation_tools = [\n",
347
+ " {\n",
348
+ " \"type\": \"function\",\n",
349
+ " \"function\": {\n",
350
+ " \"name\": \"purpose_driven_translation\",\n",
351
+ " \"description\": \"Translate given the purpose and the target audience.\",\n",
352
+ " \"parameters\": {\n",
353
+ " \"type\": \"object\",\n",
354
+ " \"properties\": {\n",
355
+ " \"translation\": {\n",
356
+ " \"type\": \"string\",\n",
357
+ " \"description\": \"The translated \\\"source_text\\\".\",\n",
358
+ " },\n",
359
+ " },\n",
360
+ " \"required\": [\"translation\"],\n",
361
+ " },\n",
362
+ " },\n",
363
+ " }\n",
364
+ "]\n",
365
+ "\n",
366
+ "# Create the purpose-driven chat completion function using function calling\n",
367
+ "def purpose_driven_chat_completion(translation_input, dataset):\n",
368
  " \"\"\"\n",
369
+ " Generates a completion for a chat conversation using a specified system prompt and a user input,\n",
370
+ " incorporating function calling to retrieve translation context.\n",
371
  " \"\"\"\n",
372
+ " \n",
373
+ " # Prepare the prompt\n",
374
+ " prompt = f\"\"\"Translate the English \"source text\" into Arabic. Please fulfill the \"Purpose of the translation\" and tailor it to the \"target audience\". Respond in a json format with just the translation as the key.\n",
375
+ "{{\n",
376
+ " \"Purpose of the translation\": \"{dataset_to_purpose_target[dataset]['purpose']}\"\n",
377
+ " \"Target audience\": \"{dataset_to_purpose_target[dataset]['audience']}\"\n",
378
+ " \"source text\" `{translation_input}`\n",
379
+ "}} \n",
380
+ "Translation json: \"\"\"\n",
381
  "\n",
382
+ " # Initial messages, including the function call to get context\n",
 
 
 
 
 
383
  " messages = [\n",
384
+ " {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
385
  " {\n",
386
  " \"role\": \"user\",\n",
387
  " \"content\": prompt,\n",
388
  " },\n",
389
  " ]\n",
390
+ "\n",
391
+ " \n",
392
+ " # Call the chat completion API with the function tools and specific tool choice\n",
393
+ " return client.chat_completion(messages, max_tokens=10_000, tools=purpose_driven_translation_tools, tool_choice='purpose_driven_translation')"
394
  ]
395
  },
396
  {
397
  "cell_type": "code",
398
+ "execution_count": 11,
399
  "id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
400
  "metadata": {
401
  "id": "4115515d-cbcd-405a-b2e0-a805880a40c4",
 
404
  "outputs": [],
405
  "source": [
406
  "translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
407
+ "response = purpose_driven_chat_completion(translation_input, \"ELRC-24ss\")"
408
  ]
409
  },
410
  {
411
  "cell_type": "code",
412
+ "execution_count": 12,
413
+ "id": "51cc3241-ef6d-43e8-8740-defb6f542918",
414
  "metadata": {
 
 
415
  "tags": []
416
  },
417
  "outputs": [
418
  {
419
  "data": {
420
  "text/plain": [
421
+ "'Manual Purpose Driven Total Prompt tokens: 350'"
422
  ]
423
  },
424
  "execution_count": 12,
 
427
  }
428
  ],
429
  "source": [
430
+ "f\"Manual Purpose Driven Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
431
  ]
432
  },
433
  {
434
  "cell_type": "code",
435
+ "execution_count": 13,
436
  "id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
437
  "metadata": {
438
  "id": "1f1c6dd0-11bf-4b88-9029-8bce1e7bcb1c",
 
444
  "name": "stdout",
445
  "output_type": "stream",
446
  "text": [
447
+ "{'translation': 'لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد ... '\n",
448
+ " 'الرؤى، فإن هذا يؤدي إلى نقاش حول ما إذا كان ينبغي التركيز على '\n",
449
+ " 'صحة النظام البيئي أو رفاهية الإنسان ... إن مسألة ما إذا كان '\n",
450
+ " 'الأولوية للنظم البيئية أو الناس تؤثر بشكل كبير على تقييم '\n",
451
+ " 'أصحاب المصلحة للحالات الاجتماعية والبيئية المرغوبة.'}\n"
452
  ]
453
  }
454
  ],
455
  "source": [
456
+ "from pprint import pprint\n",
457
+ "description_json = response.choices[0].message.tool_calls[0].function.arguments\n",
458
+ "pprint(description_json)"
459
  ]
460
  },
461
  {
 
487
  "id": "2f5deb21-18fb-4c5b-9045-c7fe5e751c05"
488
  },
489
  "source": [
490
+ "It's usually helpful if we tell the LLM what we want to create when we prompt it. \n",
491
  "\n",
492
+ "> ```I want to translate the following source_text from English into Arabic. But first I want to create a json that includes the following:\n",
493
+ "{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}.\n",
494
+ "Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
495
+ "{\n",
496
+ " \"source_text\": {translation_input}\n",
497
+ "}```\n",
498
+ "\n"
499
  ]
500
  },
501
  {
 
510
  },
511
  {
512
  "cell_type": "code",
513
+ "execution_count": 14,
514
  "id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
515
  "metadata": {
516
  "id": "affe3668-aa37-47b5-be37-a0bd5dabab56",
 
518
  },
519
  "outputs": [],
520
  "source": [
521
+ "automatic_purpose_driven_translation_tools = [\n",
522
  " {\n",
523
  " \"type\": \"function\",\n",
524
  " \"function\": {\n",
 
558
  },
559
  {
560
  "cell_type": "code",
561
+ "execution_count": 15,
562
  "id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
563
  "metadata": {
564
+ "id": "b6436ce6-03af-4206-a283-0c2ecd17bd88",
565
+ "tags": []
566
  },
567
  "outputs": [],
568
  "source": [
569
+ "def tool_call_chat_completion(translation_input):\n",
570
  " \"\"\"\n",
571
  " Generates a completion for a chat conversation using a specified system prompt and a user input.\n",
572
  " \"\"\"\n",
573
  "\n",
574
+ " prompt = f\"\"\"I want to translate the following source_text from English into Arabic. But first I want to create a json that includes the following:\n",
575
  "{{\"subject\": \"\", \"assumptions relating to content\": \"\", \"purpose\": \"\", \"target audience\": \"\"}}.\n",
576
  "Can you fill this out and be specific to how this can help you translate in the next step? No need to translate yet!\n",
577
+ "{{\n",
578
+ " \"source_text\": {translation_input}\n",
579
+ "}}\n",
580
  "\"\"\"\n",
581
  " messages = [\n",
582
+ " {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
583
  " {\n",
584
  " \"role\": \"user\",\n",
585
  " \"content\": prompt,\n",
586
  " },\n",
587
  " ]\n",
588
+ " return client.chat_completion(messages, max_tokens=10_000, tools=automatic_purpose_driven_translation_tools, tool_choice='get_translation_audience_purpose')"
589
  ]
590
  },
591
  {
592
  "cell_type": "code",
593
+ "execution_count": 16,
594
  "id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
595
  "metadata": {
596
  "id": "a731b2c0-54a3-4b8e-83f6-1663c759cf79",
 
599
  "outputs": [],
600
  "source": [
601
  "translation_input = \"We have observed that when groups of stakeholders work to define … visions, this leads to debate over whether to emphasize ecosystem health or human well-being … Whether the priority is ecosystems or people greatly influences stakeholders' assessment of desirable ecological and social states.\"\n",
602
+ "response = tool_call_chat_completion(translation_input)"
603
  ]
604
  },
605
  {
606
  "cell_type": "code",
607
+ "execution_count": 17,
608
+ "id": "f87110a2-a40b-4d65-a12d-728dbdda8fbe",
609
  "metadata": {
610
  "id": "07e0f133-f6e1-4bc6-969c-da9d36bfba2f",
611
  "outputId": "82e2506a-7336-4909-b8e4-fc52f671c511",
 
615
  {
616
  "data": {
617
  "text/plain": [
618
+ "'Function Calling Prompt tokens: 406'"
619
  ]
620
  },
621
  "execution_count": 17,
 
624
  }
625
  ],
626
  "source": [
627
+ "f\"Function Calling Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
 
628
  ]
629
  },
630
  {
631
  "cell_type": "code",
632
+ "execution_count": 18,
633
  "id": "02be1827-7137-463f-a026-0b26dec6f552",
634
  "metadata": {
635
  "id": "02be1827-7137-463f-a026-0b26dec6f552",
 
641
  "name": "stdout",
642
  "output_type": "stream",
643
  "text": [
644
+ "{'assumptions relating to the content': 'The source text assumes that there is '\n",
645
+ " 'a debate between ecological health '\n",
646
+ " 'and human well-being, and that '\n",
647
+ " 'stakeholders have different '\n",
648
+ " 'priorities that influence their '\n",
649
+ " 'assessment of desirable ecological '\n",
650
+ " 'and social outcomes.',\n",
651
+ " 'audience': 'Individuals interested in environmental policy, ecology, '\n",
652
+ " 'sustainability, and/or stakeholder engagement.',\n",
653
+ " 'purpose': 'To communicate observations about the varying priorities of '\n",
654
+ " 'different stakeholder groups and how these priorities impact '\n",
655
+ " 'their definition of vision, particularly in the context of '\n",
656
+ " 'ecosystem health versus human well-being.',\n",
657
+ " 'subject': 'Stakeholder priorities and their impact on defining visions '\n",
658
+ " 'related to ecological and social outcomes.'}\n"
 
659
  ]
660
  }
661
  ],
 
667
  },
668
  {
669
  "cell_type": "code",
670
+ "execution_count": 19,
671
  "id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
672
  "metadata": {
673
+ "id": "7575bd09-2d20-49ae-bb10-162a0e469f16",
674
+ "tags": []
675
  },
676
  "outputs": [],
677
  "source": [
678
+ "def automatic_purpose_driven_chat_completion(translation_input, description_json):\n",
679
  " \"\"\"\n",
680
  " Asks the model to translate `translation_input` from English to Arabic, supplying `description_json` as context alongside the text.\n",
681
  " \"\"\"\n",
682
  "\n",
683
+ " prompt = f\"\"\"Given the following description translate source_text from English to Arabic\n",
684
+ "{{\n",
685
+ " \"description\": {description_json},\n",
686
+ " \"translation\": {translation_input}\n",
687
+ "}}\n",
688
  "Translation:\n",
689
  "\"\"\"\n",
690
  " messages = [\n",
691
+ " {\"role\": \"system\", \"content\": baseline_system_prompt},\n",
692
+ " {\"role\": \"user\", \"content\": prompt},\n",
 
 
 
693
  " ]\n",
694
  " return client.chat_completion(messages, max_tokens=10_000)"
695
  ]
696
  },
697
  {
698
  "cell_type": "code",
699
+ "execution_count": 20,
700
  "id": "32bccca0-c866-4006-84d1-d5b783b73689",
701
  "metadata": {
702
  "id": "32bccca0-c866-4006-84d1-d5b783b73689",
 
704
  },
705
  "outputs": [],
706
  "source": [
707
+ "response = automatic_purpose_driven_chat_completion(translation_input, description_json)"
708
  ]
709
  },
710
  {
711
  "cell_type": "code",
712
+ "execution_count": 21,
713
  "id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
714
  "metadata": {
715
  "id": "b0867efb-39ea-4f9a-b073-2a84261f3821",
 
720
  {
721
  "data": {
722
  "text/plain": [
723
+ "'Automatic Purpose Driven Total Prompt tokens: 235'"
724
  ]
725
  },
726
  "execution_count": 21,
 
729
  }
730
  ],
731
  "source": [
732
+ "f\"Automatic Purpose Driven Total Prompt tokens: {response.usage.prompt_tokens - len(tokenizer(translation_input, return_tensors='pt')['input_ids'][0])}\""
 
733
  ]
734
  },
735
  {
736
  "cell_type": "code",
737
+ "execution_count": 22,
738
  "id": "462ca84c-9ffd-4924-880d-e06b724caf02",
739
  "metadata": {
740
  "id": "462ca84c-9ffd-4924-880d-e06b724caf02",
 
746
  "name": "stdout",
747
  "output_type": "stream",
748
  "text": [
749
+ "{\n",
750
+ " \"description\": {\n",
751
+ " \"الافتراضات المتعلقة بالمحتوى\": \"يفترض النص المصدري وجود نقاش بين الصحة البيئية ورفاهية الإنسان، وأن أصحاب المصلحة لديهم أولويات مختلفة تؤثر على تقييمهم للنتائج البيئية والاجتماعية المرجوة.\",\n",
752
+ " \"الجمهور\": \"الأفراد المهتمون بالسياسة البيئية، أو علم البيئة، أو الاستدامة، و/أو مشاركة أصحاب المصلحة.\",\n",
753
+ " \"الغرض\": \"إيصال الملاحظات حول الأولويات المتنوعة لمجموعات أصحاب المصلحة المختلفة، وكيف تؤثر هذه الأولويات على تعريفهم للرؤى، خاصة في سياق صحة الأنظمة البيئية مقابل رفاهية الإنسان.\",\n",
754
+ " \"الموضوع\": \"أولويات أصحاب المصلحة وتأثيرها على تحديد الرؤى المتعلقة بالنتائج البيئية والاجتماعية.\"\n",
755
+ " },\n",
756
+ " \"الترجمة\": \"لاحظنا أنه عندما تعمل مجموعات أصحاب المصلحة على تحديد ... الرؤى، فإن هذا يؤدي إلى نقاش حول ما إذا كان ينبغي التأكيد على صحة النظام البيئي أو رفاهية الإنسان ... سواء كانت الأولوية للنظم البيئية أو للبشر يؤثر بشكل كبير على تقييم أصحاب المصلحة للحالات البيئية والاجتماعية المرغوبة.\"\n",
757
+ "}\n"
758
  ]
759
  }
760
  ],
 
764
  },
765
  {
766
  "cell_type": "markdown",
767
+ "id": "28a0e518-358e-44f0-97a4-ea76a5563743",
768
+ "metadata": {},
769
+ "source": [
770
+ "### Helper Function"
771
+ ]
772
+ },
773
+ {
774
+ "cell_type": "code",
775
+ "execution_count": 23,
776
+ "id": "7fbf9e01-23d9-41b0-a1e3-fd7a99c55bd0",
777
  "metadata": {
778
+ "tags": []
779
  },
780
+ "outputs": [],
781
  "source": [
782
+ "def automatic_purpose_driven_chat(translation_input):\n",
783
+ " response = tool_call_chat_completion(translation_input)\n",
784
+ " description_json = response.choices[0].message.tool_calls[0].function.arguments\n",
785
+ " return automatic_purpose_driven_chat_completion(translation_input, description_json)"
786
  ]
787
  },
788
  {
789
  "cell_type": "code",
790
+ "execution_count": 24,
791
+ "id": "8c5dd9e6-44c1-4ac7-92ef-c2e594b7b91d",
792
  "metadata": {
 
 
793
  "tags": []
794
  },
795
  "outputs": [
796
  {
797
  "data": {
798
  "text/plain": [
799
+ "'الافتراضات المتعلقة بالمحتوى: لا توجد افتراضات محددة.\\n\\nالجمهور المستهدف: جمهور عام لا يحتاج إلى معرفة تقنية محددة.\\n\\nالغرض: نقل رسالة بسيطة لاختبار الترجمة.\\n\\nالموضوع: اختبار الترجمة\\n\\nالترجمة: هذا اختبار'"
800
  ]
801
  },
802
+ "execution_count": 24,
803
  "metadata": {},
804
  "output_type": "execute_result"
805
  }
806
  ],
807
+ "source": [
808
+ "automatic_purpose_driven_chat(\"This is a test\").choices[0].message.content"
809
+ ]
810
+ },
811
+ {
812
+ "cell_type": "markdown",
813
+ "id": "4abf1aeb-ee4e-4b8f-97c3-e6b1664ac8b8",
814
+ "metadata": {
815
+ "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166"
816
+ },
817
+ "source": [
818
+ "## Dataset Creation"
819
+ ]
820
+ },
821
+ {
822
+ "cell_type": "code",
823
+ "execution_count": 25,
824
+ "id": "4b2e3951-6704-43d5-a69b-7587f26e6491",
825
+ "metadata": {
826
+ "tags": []
827
+ },
828
+ "outputs": [
829
+ {
830
+ "name": "stderr",
831
+ "output_type": "stream",
832
+ "text": [
833
+ "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
834
+ "To disable this warning, you can either:\n",
835
+ "\t- Avoid using `tokenizers` before the fork if possible\n",
836
+ "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
837
+ ]
838
+ },
839
+ {
840
+ "data": {
841
+ "application/vnd.jupyter.widget-view+json": {
842
+ "model_id": "e983537088ae4305ada7aff87127eaa2",
843
+ "version_major": 2,
844
+ "version_minor": 0
845
+ },
846
+ "text/plain": [
847
+ "Map: 0%| | 0/24 [00:00<?, ? examples/s]"
848
+ ]
849
+ },
850
+ "metadata": {},
851
+ "output_type": "display_data"
852
+ },
853
+ {
854
+ "data": {
855
+ "application/vnd.jupyter.widget-view+json": {
856
+ "model_id": "0595b2ee75a44cb48f4fef0fbcb75752",
857
+ "version_major": 2,
858
+ "version_minor": 0
859
+ },
860
+ "text/plain": [
861
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
862
+ ]
863
+ },
864
+ "metadata": {},
865
+ "output_type": "display_data"
866
+ },
867
+ {
868
+ "data": {
869
+ "application/vnd.jupyter.widget-view+json": {
870
+ "model_id": "2e42a9acaf544f6a9dbd7cd3fcb3d381",
871
+ "version_major": 2,
872
+ "version_minor": 0
873
+ },
874
+ "text/plain": [
875
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
876
+ ]
877
+ },
878
+ "metadata": {},
879
+ "output_type": "display_data"
880
+ },
881
+ {
882
+ "data": {
883
+ "application/vnd.jupyter.widget-view+json": {
884
+ "model_id": "6252b1b7f45a4b779ac77748a997256f",
885
+ "version_major": 2,
886
+ "version_minor": 0
887
+ },
888
+ "text/plain": [
889
+ "Downloading data: 0%| | 0.00/5.38k [00:00<?, ?B/s]"
890
+ ]
891
+ },
892
+ "metadata": {},
893
+ "output_type": "display_data"
894
+ },
895
+ {
896
+ "data": {
897
+ "application/vnd.jupyter.widget-view+json": {
898
+ "model_id": "10d2c9d09da54646a6ea06ff09622ad0",
899
+ "version_major": 2,
900
+ "version_minor": 0
901
+ },
902
+ "text/plain": [
903
+ "Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
904
+ ]
905
+ },
906
+ "metadata": {},
907
+ "output_type": "display_data"
908
+ },
909
+ {
910
+ "data": {
911
+ "application/vnd.jupyter.widget-view+json": {
912
+ "model_id": "ee28014824554610903cb642534c6cfe",
913
+ "version_major": 2,
914
+ "version_minor": 0
915
+ },
916
+ "text/plain": [
917
+ "Map: 0%| | 0/25 [00:00<?, ? examples/s]"
918
+ ]
919
+ },
920
+ "metadata": {},
921
+ "output_type": "display_data"
922
+ },
923
+ {
924
+ "data": {
925
+ "application/vnd.jupyter.widget-view+json": {
926
+ "model_id": "c26ea784ed514f6382898cdd82e629c0",
927
+ "version_major": 2,
928
+ "version_minor": 0
929
+ },
930
+ "text/plain": [
931
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
932
+ ]
933
+ },
934
+ "metadata": {},
935
+ "output_type": "display_data"
936
+ },
937
+ {
938
+ "data": {
939
+ "application/vnd.jupyter.widget-view+json": {
940
+ "model_id": "6470a8eba6cc45e1b266f160e07ac5f9",
941
+ "version_major": 2,
942
+ "version_minor": 0
943
+ },
944
+ "text/plain": [
945
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
946
+ ]
947
+ },
948
+ "metadata": {},
949
+ "output_type": "display_data"
950
+ },
951
+ {
952
+ "data": {
953
+ "application/vnd.jupyter.widget-view+json": {
954
+ "model_id": "3e4d136bda254528993b54d6c3b5395a",
955
+ "version_major": 2,
956
+ "version_minor": 0
957
+ },
958
+ "text/plain": [
959
+ "README.md: 0%| | 0.00/599 [00:00<?, ?B/s]"
960
+ ]
961
+ },
962
+ "metadata": {},
963
+ "output_type": "display_data"
964
+ },
965
+ {
966
+ "data": {
967
+ "application/vnd.jupyter.widget-view+json": {
968
+ "model_id": "23978739fcbf40ffb24f78c4e99088bc",
969
+ "version_major": 2,
970
+ "version_minor": 0
971
+ },
972
+ "text/plain": [
973
+ "Downloading data: 0%| | 0.00/13.8k [00:00<?, ?B/s]"
974
+ ]
975
+ },
976
+ "metadata": {},
977
+ "output_type": "display_data"
978
+ },
979
+ {
980
+ "data": {
981
+ "application/vnd.jupyter.widget-view+json": {
982
+ "model_id": "d4bcf66919fb4444b5c2e4f489a57dc2",
983
+ "version_major": 2,
984
+ "version_minor": 0
985
+ },
986
+ "text/plain": [
987
+ "Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
988
+ ]
989
+ },
990
+ "metadata": {},
991
+ "output_type": "display_data"
992
+ },
993
+ {
994
+ "data": {
995
+ "application/vnd.jupyter.widget-view+json": {
996
+ "model_id": "a3f4915f12a047daabdcbf50eb133131",
997
+ "version_major": 2,
998
+ "version_minor": 0
999
+ },
1000
+ "text/plain": [
1001
+ "Map: 0%| | 0/25 [00:00<?, ? examples/s]"
1002
+ ]
1003
+ },
1004
+ "metadata": {},
1005
+ "output_type": "display_data"
1006
+ },
1007
+ {
1008
+ "data": {
1009
+ "application/vnd.jupyter.widget-view+json": {
1010
+ "model_id": "4c628855a75447fe812638f60d81e259",
1011
+ "version_major": 2,
1012
+ "version_minor": 0
1013
+ },
1014
+ "text/plain": [
1015
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
1016
+ ]
1017
+ },
1018
+ "metadata": {},
1019
+ "output_type": "display_data"
1020
+ },
1021
+ {
1022
+ "data": {
1023
+ "application/vnd.jupyter.widget-view+json": {
1024
+ "model_id": "3ec49e43954d43e4bab6581afb5c3b95",
1025
+ "version_major": 2,
1026
+ "version_minor": 0
1027
+ },
1028
+ "text/plain": [
1029
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
1030
+ ]
1031
+ },
1032
+ "metadata": {},
1033
+ "output_type": "display_data"
1034
+ },
1035
+ {
1036
+ "data": {
1037
+ "application/vnd.jupyter.widget-view+json": {
1038
+ "model_id": "be12c10f2f4846a49ad71670bfbf7df3",
1039
+ "version_major": 2,
1040
+ "version_minor": 0
1041
+ },
1042
+ "text/plain": [
1043
+ "README.md: 0%| | 0.00/1.17k [00:00<?, ?B/s]"
1044
+ ]
1045
+ },
1046
+ "metadata": {},
1047
+ "output_type": "display_data"
1048
+ },
1049
+ {
1050
+ "data": {
1051
+ "application/vnd.jupyter.widget-view+json": {
1052
+ "model_id": "85b48f1ca1414aed9d24c09b7ee64ae8",
1053
+ "version_major": 2,
1054
+ "version_minor": 0
1055
+ },
1056
+ "text/plain": [
1057
+ "Downloading data: 0%| | 0.00/5.74k [00:00<?, ?B/s]"
1058
+ ]
1059
+ },
1060
+ "metadata": {},
1061
+ "output_type": "display_data"
1062
+ },
1063
+ {
1064
+ "data": {
1065
+ "application/vnd.jupyter.widget-view+json": {
1066
+ "model_id": "66f25ed30a7a47eba1f62c59a5ef6b1a",
1067
+ "version_major": 2,
1068
+ "version_minor": 0
1069
+ },
1070
+ "text/plain": [
1071
+ "Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
1072
+ ]
1073
+ },
1074
+ "metadata": {},
1075
+ "output_type": "display_data"
1076
+ },
1077
+ {
1078
+ "data": {
1079
+ "application/vnd.jupyter.widget-view+json": {
1080
+ "model_id": "0b1856a42f844ab286c628a5ab48ba6c",
1081
+ "version_major": 2,
1082
+ "version_minor": 0
1083
+ },
1084
+ "text/plain": [
1085
+ "Map: 0%| | 0/25 [00:00<?, ? examples/s]"
1086
+ ]
1087
+ },
1088
+ "metadata": {},
1089
+ "output_type": "display_data"
1090
+ },
1091
+ {
1092
+ "data": {
1093
+ "application/vnd.jupyter.widget-view+json": {
1094
+ "model_id": "06b2892ed4da4846baee7b178c42fa40",
1095
+ "version_major": 2,
1096
+ "version_minor": 0
1097
+ },
1098
+ "text/plain": [
1099
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
1100
+ ]
1101
+ },
1102
+ "metadata": {},
1103
+ "output_type": "display_data"
1104
+ },
1105
+ {
1106
+ "data": {
1107
+ "application/vnd.jupyter.widget-view+json": {
1108
+ "model_id": "92fb9cf2d4bf4b2abf9412d9238bc769",
1109
+ "version_major": 2,
1110
+ "version_minor": 0
1111
+ },
1112
+ "text/plain": [
1113
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
1114
+ ]
1115
+ },
1116
+ "metadata": {},
1117
+ "output_type": "display_data"
1118
+ },
1119
+ {
1120
+ "data": {
1121
+ "application/vnd.jupyter.widget-view+json": {
1122
+ "model_id": "36914c848e654dee81683a379721b8c9",
1123
+ "version_major": 2,
1124
+ "version_minor": 0
1125
+ },
1126
+ "text/plain": [
1127
+ "README.md: 0%| | 0.00/1.74k [00:00<?, ?B/s]"
1128
+ ]
1129
+ },
1130
+ "metadata": {},
1131
+ "output_type": "display_data"
1132
+ },
1133
+ {
1134
+ "data": {
1135
+ "application/vnd.jupyter.widget-view+json": {
1136
+ "model_id": "6b23b1c80d1349c68fea02d8bb4baee1",
1137
+ "version_major": 2,
1138
+ "version_minor": 0
1139
+ },
1140
+ "text/plain": [
1141
+ "Downloading data: 0%| | 0.00/10.9k [00:00<?, ?B/s]"
1142
+ ]
1143
+ },
1144
+ "metadata": {},
1145
+ "output_type": "display_data"
1146
+ },
1147
+ {
1148
+ "data": {
1149
+ "application/vnd.jupyter.widget-view+json": {
1150
+ "model_id": "0b91c12b7b6c4d88a32504e52be379b3",
1151
+ "version_major": 2,
1152
+ "version_minor": 0
1153
+ },
1154
+ "text/plain": [
1155
+ "Generating train split: 0%| | 0/25 [00:00<?, ? examples/s]"
1156
+ ]
1157
+ },
1158
+ "metadata": {},
1159
+ "output_type": "display_data"
1160
+ },
1161
+ {
1162
+ "data": {
1163
+ "application/vnd.jupyter.widget-view+json": {
1164
+ "model_id": "188ee5373fbc4b2c845ae823b073daca",
1165
+ "version_major": 2,
1166
+ "version_minor": 0
1167
+ },
1168
+ "text/plain": [
1169
+ "Map: 0%| | 0/25 [00:00<?, ? examples/s]"
1170
+ ]
1171
+ },
1172
+ "metadata": {},
1173
+ "output_type": "display_data"
1174
+ },
1175
+ {
1176
+ "data": {
1177
+ "application/vnd.jupyter.widget-view+json": {
1178
+ "model_id": "91b7e4137f50487d927dabc859d9f19f",
1179
+ "version_major": 2,
1180
+ "version_minor": 0
1181
+ },
1182
+ "text/plain": [
1183
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
1184
+ ]
1185
+ },
1186
+ "metadata": {},
1187
+ "output_type": "display_data"
1188
+ },
1189
+ {
1190
+ "data": {
1191
+ "application/vnd.jupyter.widget-view+json": {
1192
+ "model_id": "d2b2d76b482f4fe0a7dcf1ff4cfc1168",
1193
+ "version_major": 2,
1194
+ "version_minor": 0
1195
+ },
1196
+ "text/plain": [
1197
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
1198
+ ]
1199
+ },
1200
+ "metadata": {},
1201
+ "output_type": "display_data"
1202
+ },
1203
+ {
1204
+ "data": {
1205
+ "application/vnd.jupyter.widget-view+json": {
1206
+ "model_id": "4fff6b9680244b85a572cf6373781cec",
1207
+ "version_major": 2,
1208
+ "version_minor": 0
1209
+ },
1210
+ "text/plain": [
1211
+ "README.md: 0%| | 0.00/2.34k [00:00<?, ?B/s]"
1212
+ ]
1213
+ },
1214
+ "metadata": {},
1215
+ "output_type": "display_data"
1216
+ },
1217
+ {
1218
+ "data": {
1219
+ "application/vnd.jupyter.widget-view+json": {
1220
+ "model_id": "21baa00c99e2485d8af5edaf095dfc46",
1221
+ "version_major": 2,
1222
+ "version_minor": 0
1223
+ },
1224
+ "text/plain": [
1225
+ "Downloading data: 0%| | 0.00/16.5k [00:00<?, ?B/s]"
1226
+ ]
1227
+ },
1228
+ "metadata": {},
1229
+ "output_type": "display_data"
1230
+ },
1231
+ {
1232
+ "data": {
1233
+ "application/vnd.jupyter.widget-view+json": {
1234
+ "model_id": "610e87cdb9a44e91b8eb1c0dbe4af60f",
1235
+ "version_major": 2,
1236
+ "version_minor": 0
1237
+ },
1238
+ "text/plain": [
1239
+ "Generating train split: 0%| | 0/24 [00:00<?, ? examples/s]"
1240
+ ]
1241
+ },
1242
+ "metadata": {},
1243
+ "output_type": "display_data"
1244
+ },
1245
+ {
1246
+ "data": {
1247
+ "application/vnd.jupyter.widget-view+json": {
1248
+ "model_id": "d7fe56bef4fd490fa18ad574cc72ba77",
1249
+ "version_major": 2,
1250
+ "version_minor": 0
1251
+ },
1252
+ "text/plain": [
1253
+ "Map: 0%| | 0/24 [00:00<?, ? examples/s]"
1254
+ ]
1255
+ },
1256
+ "metadata": {},
1257
+ "output_type": "display_data"
1258
+ },
1259
+ {
1260
+ "data": {
1261
+ "application/vnd.jupyter.widget-view+json": {
1262
+ "model_id": "14d8c2e7c9144f62ab6d5ba3dae7e96f",
1263
+ "version_major": 2,
1264
+ "version_minor": 0
1265
+ },
1266
+ "text/plain": [
1267
+ "Uploading the dataset shards: 0%| | 0/1 [00:00<?, ?it/s]"
1268
+ ]
1269
+ },
1270
+ "metadata": {},
1271
+ "output_type": "display_data"
1272
+ },
1273
+ {
1274
+ "data": {
1275
+ "application/vnd.jupyter.widget-view+json": {
1276
+ "model_id": "eb21d60a8bf940e797a432469b74d262",
1277
+ "version_major": 2,
1278
+ "version_minor": 0
1279
+ },
1280
+ "text/plain": [
1281
+ "Creating parquet from Arrow format: 0%| | 0/1 [00:00<?, ?ba/s]"
1282
+ ]
1283
+ },
1284
+ "metadata": {},
1285
+ "output_type": "display_data"
1286
+ },
1287
+ {
1288
+ "data": {
1289
+ "application/vnd.jupyter.widget-view+json": {
1290
+ "model_id": "0cc785b4868143f99ffe04a0f9239692",
1291
+ "version_major": 2,
1292
+ "version_minor": 0
1293
+ },
1294
+ "text/plain": [
1295
+ "README.md: 0%| | 0.00/2.91k [00:00<?, ?B/s]"
1296
+ ]
1297
+ },
1298
+ "metadata": {},
1299
+ "output_type": "display_data"
1300
+ }
1301
+ ],
1302
+ "source": [
1303
+ "from datasets import load_dataset\n",
1304
+ "\n",
1305
+ "subsets = ['ELRC-24ss', 'GNOME-25ss', 'HPLT-25ss', 'OpenSubtitles-25ss', 'TED2020-25ss', 'UNPC-24ss']\n",
1306
+ "\n",
1307
+ "# Iterate over each subset\n",
1308
+ "for subset in subsets:\n",
1309
+ " # Load the dataset for the specific subset\n",
1310
+ " dataset = load_dataset(\"arabic-translation-prompt-engineering/TpDwD\", subset)\n",
1311
+ "\n",
1312
+ " # Rename the columns\n",
1313
+ " dataset = dataset.rename_column(\"ar_text\", \"human_translation\")\n",
1314
+ " dataset = dataset.rename_column(\"en_text\", \"source_text\")\n",
1315
+ "\n",
1316
+ " # Apply functions to add new columns\n",
1317
+ " dataset = dataset.map(lambda example: {\n",
1318
+ " \"baseline_translation\": baseline_chat_completion(example['source_text']).choices[0].message.content,\n",
1319
+ " \"purpose_driven_translation\": purpose_driven_chat_completion(example['source_text'], subset).choices[0].message.tool_calls[0].function.arguments['translation'],\n",
1320
+ " \"automatic_purpose_driven_translation\": automatic_purpose_driven_chat(example['source_text']).choices[0].message.content\n",
1321
+ " })\n",
1322
+ " \n",
1323
+ " # Push the processed dataset to the Hub\n",
1324
+ " dataset.push_to_hub(f\"arabic-translation-prompt-engineering/TpDwD_translated\",subset)\n"
1325
+ ]
1326
+ },
1327
+ {
1328
+ "cell_type": "markdown",
1329
+ "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166",
1330
+ "metadata": {
1331
+ "id": "1ec3b20b-8393-4fda-a51d-cf67984cc166"
1332
+ },
1333
+ "source": [
1334
+ "# Push to the hub"
1335
+ ]
1336
+ },
1337
+ {
1338
+ "cell_type": "code",
1339
+ "execution_count": null,
1340
+ "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
1341
+ "metadata": {
1342
+ "id": "6fc55725-216f-45dd-9c6d-dae77e16d606",
1343
+ "outputId": "d3b9b994-e3af-4f05-88b4-3b29778d1dc7",
1344
+ "tags": []
1345
+ },
1346
+ "outputs": [],
1347
  "source": [
1348
  "from huggingface_hub import HfApi\n",
1349
  "\n",