{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", "langchain-experimental 0.0.64 requires langchain-community<0.3.0,>=0.2.10, but you have langchain-community 0.3.0 which is incompatible.\n", "langchain-experimental 0.0.64 requires langchain-core<0.3.0,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.\n", "langgraph 0.2.16 requires langchain-core<0.3,>=0.2.27, but you have langchain-core 0.3.1 which is incompatible.\n", "langchain-huggingface 0.0.3 requires langchain-core<0.3,>=0.1.52, but you have langchain-core 0.3.1 which is incompatible.\n", "langgraph-checkpoint 1.0.6 requires langchain-core<0.3,>=0.2.22, but you have langchain-core 0.3.1 which is incompatible.\u001b[0m\u001b[31m\n", "\u001b[0m" ] } ], "source": [ "# %pip (rather than !pip) installs into the environment of the running kernel.\n", "%pip install -qU langsmith langchain-core langchain-community langchain-openai langchain-qdrant" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# ragas is held below 0.2 because later cells import ragas.testset.generator and\n", "# ragas.testset.evolutions, which only exist in the 0.1 release line.\n", "%pip install -qU pymupdf \"ragas<0.2\"" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "import os\n", "import getpass\n", "\n", "os.environ[\"LANGCHAIN_TRACING_V2\"] = \"true\"\n", "os.environ[\"LANGCHAIN_API_KEY\"] = getpass.getpass(\"LangChain API Key:\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "from uuid import uuid4\n", "\n", "os.environ[\"LANGCHAIN_PROJECT\"] = f\"AIM_Midterm - SDG - {uuid4().hex[0:8]}\"" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "os.environ[\"OPENAI_API_KEY\"] = getpass.getpass(\"OpenAI API Key:\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# RAG Chain" ] }, { "cell_type": "code", 
"execution_count": 24, "metadata": {}, "outputs": [], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "\n", "filepath_NIST = \"data/NIST.AI.600-1.pdf\"\n", "filepath_Blueprint = \"data/Blueprint-for-an-AI-Bill-of-Rights.pdf\"\n", "\n", "documents_NIST = PyMuPDFLoader(filepath_NIST).load()\n", "documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()\n" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Both PDFs are indexed together as a single corpus.\n", "documents = documents_NIST + documents_Blueprint" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [], "source": [ "from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", "text_splitter = RecursiveCharacterTextSplitter(\n", " chunk_size = 500,\n", " chunk_overlap = 50\n", ")\n", "\n", "rag_documents = text_splitter.split_documents(documents)" ] }, { "cell_type": "code", "execution_count": 28, "metadata": {}, "outputs": [], "source": [ "from langchain_openai import OpenAIEmbeddings\n", "from langchain_community.vectorstores import Qdrant\n", "from langchain_qdrant import QdrantVectorStore\n", "from qdrant_client import QdrantClient\n", "from qdrant_client.http.models import Distance, VectorParams\n", "\n", "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", "\n", "# Build the in-memory index with QdrantVectorStore from langchain-qdrant; the\n", "# langchain_community Qdrant class is deprecated in favour of it.\n", "vectorstore = QdrantVectorStore.from_documents(\n", " documents=rag_documents,\n", " embedding=embeddings,\n", " location=\":memory:\",\n", " collection_name=\"Implications of AI\"\n", ")\n", "\n", "retriever = vectorstore.as_retriever()" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [], "source": [ "from langchain.prompts import ChatPromptTemplate\n", "\n", "RAG_PROMPT = \"\"\"\\\n", "Given a provided context and question, you must answer the question based only on context.\n", "\n", "If you cannot answer the question based on the context - you must say \"I don't know\".\n", "\n", 
"Context: {context}\n", "Question: {question}\n", "\"\"\"\n", "\n", "rag_prompt = ChatPromptTemplate.from_template(RAG_PROMPT)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Generate synthetic data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "efacf74e912843b9942183b711af9f27", "version_major": 2, "version_minor": 0 }, "text/plain": [ "embedding nodes: 0%| | 0/284 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questioncontextsground_truthevolution_typemetadataepisode_done
0How do language models contribute to the reduc...[ \\n57 \\nNational Institute of Standards and T...The answer to given question is not present in...simple[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
1What should be provided in terms of notice and...[ \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCONSI...Those impacted by an automated system should b...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
2How can designers, developers, and deployers o...[ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...Designers, developers, and deployers of automa...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
3What benefits have been publicly described by ...[ \\nENDNOTES\\n12. Expectations about reporting...The benefits of 'traffic calming' measures hav...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
4What is the purpose of AI Red-teaming in testi...[ \\n49 \\nearly lifecycle TEVV approaches are d...AI Red-teaming is a structured testing exercis...simple[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
5What is the importance of training and assessm...[ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...Training and assessment are crucial in ensurin...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
6How do advertisement delivery systems reinforc...[ \\n \\n \\nWHY THIS PRINCIPLE IS IMPORTANT\\nTh...Advertisement delivery systems reinforce racia...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
7What is the purpose of the Blueprint for an AI...[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nAbo...The Blueprint for an AI Bill of Rights is inte...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
8What are the key privacy protections provided ...[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...The Privacy Act of 1974 provides privacy prote...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
9How does the Fair Credit Reporting Act ensure ...[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...The Fair Credit Reporting Act ensures that con...simple[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
10Why is ongoing monitoring and review important...[ \\n16 \\nGOVERN 1.5: Ongoing monitoring and pe...Ongoing monitoring and periodic review of the ...multi_context[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
11What criteria should automated systems in sens...[ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO...Automated systems used within sensitive domain...multi_context[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
12How can stakeholders prevent algorithmic discr...[ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti...Stakeholders can prevent algorithmic discrimin...multi_context[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
13How to evaluate GAI system outputs against ris...[ \\n40 \\nMANAGE 1.3: Responses to the AI risks...The answer to given question is not present in...multi_context[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
14How can data tracking and content detection he...[ \\n51 \\ngeneral public participants. For exam...Data tracking and content detection can help r...multi_context[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
15What's needed for an algorithmic impact assess...[ \\n \\n \\n \\n \\n \\n \\nWHAT SHOULD BE EXPECTED ...The answer to given question is not present in...multi_context[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
16What details must California warehouse employe...[ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n...Warehousing employers in California that use q...multi_context[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
17What measures are needed for handling GAI syst...[ \\n42 \\nMG-2.4-002 \\nEstablish and maintain p...Establish and maintain procedures for escalati...multi_context[{'source': 'data/NIST.AI.600-1.pdf', 'file_pa...True
18How can privacy be ensured in automated system...[ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...Privacy in automated system design can be ensu...reasoning[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
19How can algorithmic discrimination be prevente...[ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ...Algorithmic discrimination in surveillance sys...reasoning[{'source': 'data/Blueprint-for-an-AI-Bill-of-...True
\n", "" ], "text/plain": [ " question \\\n", "0 How do language models contribute to the reduc... \n", "1 What should be provided in terms of notice and... \n", "2 How can designers, developers, and deployers o... \n", "3 What benefits have been publicly described by ... \n", "4 What is the purpose of AI Red-teaming in testi... \n", "5 What is the importance of training and assessm... \n", "6 How do advertisement delivery systems reinforc... \n", "7 What is the purpose of the Blueprint for an AI... \n", "8 What are the key privacy protections provided ... \n", "9 How does the Fair Credit Reporting Act ensure ... \n", "10 Why is ongoing monitoring and review important... \n", "11 What criteria should automated systems in sens... \n", "12 How can stakeholders prevent algorithmic discr... \n", "13 How to evaluate GAI system outputs against ris... \n", "14 How can data tracking and content detection he... \n", "15 What's needed for an algorithmic impact assess... \n", "16 What details must California warehouse employe... \n", "17 What measures are needed for handling GAI syst... \n", "18 How can privacy be ensured in automated system... \n", "19 How can algorithmic discrimination be prevente... \n", "\n", " contexts \\\n", "0 [ \\n57 \\nNational Institute of Standards and T... \n", "1 [ \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCONSI... \n", "2 [ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti... \n", "3 [ \\nENDNOTES\\n12. Expectations about reporting... \n", "4 [ \\n49 \\nearly lifecycle TEVV approaches are d... \n", "5 [ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO... \n", "6 [ \\n \\n \\nWHY THIS PRINCIPLE IS IMPORTANT\\nTh... \n", "7 [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nAbo... \n", "8 [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n... \n", "9 [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n... \n", "10 [ \\n16 \\nGOVERN 1.5: Ongoing monitoring and pe... 
\n", "11 [ \\n \\n \\n \\n \\n \\n \\nHUMAN ALTERNATIVES, \\nCO... \n", "12 [ ­­­­­­­\\nALGORITHMIC DISCRIMINATION Protecti... \n", "13 [ \\n40 \\nMANAGE 1.3: Responses to the AI risks... \n", "14 [ \\n51 \\ngeneral public participants. For exam... \n", "15 [ \\n \\n \\n \\n \\n \\n \\nWHAT SHOULD BE EXPECTED ... \n", "16 [ \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n... \n", "17 [ \\n42 \\nMG-2.4-002 \\nEstablish and maintain p... \n", "18 [ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ... \n", "19 [ \\n \\n \\n \\n \\n \\nDATA PRIVACY \\nWHAT SHOULD ... \n", "\n", " ground_truth evolution_type \\\n", "0 The answer to given question is not present in... simple \n", "1 Those impacted by an automated system should b... simple \n", "2 Designers, developers, and deployers of automa... simple \n", "3 The benefits of 'traffic calming' measures hav... simple \n", "4 AI Red-teaming is a structured testing exercis... simple \n", "5 Training and assessment are crucial in ensurin... simple \n", "6 Advertisement delivery systems reinforce racia... simple \n", "7 The Blueprint for an AI Bill of Rights is inte... simple \n", "8 The Privacy Act of 1974 provides privacy prote... simple \n", "9 The Fair Credit Reporting Act ensures that con... simple \n", "10 Ongoing monitoring and periodic review of the ... multi_context \n", "11 Automated systems used within sensitive domain... multi_context \n", "12 Stakeholders can prevent algorithmic discrimin... multi_context \n", "13 The answer to given question is not present in... multi_context \n", "14 Data tracking and content detection can help r... multi_context \n", "15 The answer to given question is not present in... multi_context \n", "16 Warehousing employers in California that use q... multi_context \n", "17 Establish and maintain procedures for escalati... multi_context \n", "18 Privacy in automated system design can be ensu... reasoning \n", "19 Algorithmic discrimination in surveillance sys... 
reasoning \n", "\n", " metadata episode_done \n", "0 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "1 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "2 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "3 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "4 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "5 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "6 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "7 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "8 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "9 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "10 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "11 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "12 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "13 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "14 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "15 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "16 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "17 [{'source': 'data/NIST.AI.600-1.pdf', 'file_pa... True \n", "18 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... True \n", "19 [{'source': 'data/Blueprint-for-an-AI-Bill-of-... 
True " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from langchain_community.document_loaders import PyMuPDFLoader\n", "from ragas.testset.generator import TestsetGenerator\n", "from ragas.testset.evolutions import simple, reasoning, multi_context\n", "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", "\n", "\n", "filepath_NIST = \"data/NIST.AI.600-1.pdf\"\n", "filepath_Blueprint = \"data/Blueprint-for-an-AI-Bill-of-Rights.pdf\"\n", "\n", "documents_NIST = PyMuPDFLoader(filepath_NIST).load()\n", "documents_Blueprint = PyMuPDFLoader(filepath_Blueprint).load()\n", "documents = documents_NIST + documents_Blueprint\n", "\n", "generator_llm = ChatOpenAI(model=\"gpt-3.5-turbo\")\n", "critic_llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) \n", "embeddings = OpenAIEmbeddings(model=\"text-embedding-3-small\")\n", "\n", "generator = TestsetGenerator.from_langchain(\n", " generator_llm,\n", " critic_llm,\n", " embeddings\n", ")\n", "\n", "distributions = {\n", " simple: 0.5,\n", " multi_context: 0.4,\n", " reasoning: 0.1\n", "}\n", "\n", "testset = generator.generate_with_langchain_docs(documents, 20, distributions, with_debugging_logs=True)\n", "testset.to_pandas()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset creation" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "\n", "from langsmith import Client\n", "from datasets import Dataset\n", "\n", "\n", "client = Client()\n", "\n", "dataset_name = \"Implications of AI\"\n", "\n", "dataset = client.create_dataset(\n", " dataset_name=dataset_name,\n", " description=\"Questions about the implications of AI\"\n", ")" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "# Upload one LangSmith example per generated test row.\n", "# iterrows() yields (index, row) pairs; only the row is needed.\n", "for _, row in testset.to_pandas().iterrows():\n", "    client.create_example(\n", "        inputs={\n", "            \"question\": row[\"question\"]\n", "        },\n", "        outputs={\n", "            \"answer\": row[\"ground_truth\"]\n", "        },\n", "        metadata={\n", "            # Store the source passages for this question; the row index was\n", "            # previously stored here by mistake.\n", "            \"context\": row[\"contexts\"]\n", "        },\n", "        dataset_id=dataset.id\n", "    )\n", "\n" ] },
{ "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "How do language models contribute to the reduction of content diversity in writing?\n", "The answer to given question is not present in context\n" ] } ], "source": [ "test_questions = testset.to_pandas()[\"question\"].values.tolist()\n", "test_groundtruths = testset.to_pandas()[\"ground_truth\"].values.tolist()\n", "\n", "print(test_questions[0])\n", "print(test_groundtruths[0])" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'response': AIMessage(content=\"I don't know.\", additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 4, 'prompt_tokens': 1238, 'total_tokens': 1242, 'completion_tokens_details': {'reasoning_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_2d87079ca9', 'finish_reason': 'stop', 'logprobs': None}, id='run-6db82f54-ddff-4079-b8a4-dd0dbe43a358-0', usage_metadata={'input_tokens': 1238, 'output_tokens': 4, 'total_tokens': 1242}), 'context': [Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 6, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': '09678965559446c18cbfacbdc1e2979f', '_collection_name': 'Implications of AI'}, page_content='3 \\nthe abuse, misuse, and unsafe repurposing by humans (adversarial or 
not), and others result \\nfrom interactions between a human and an AI system. \\n• \\nTime scale: GAI risks may materialize abruptly or across extended periods. Examples include \\nimmediate (and/or prolonged) emotional harm and potential risks to physical safety due to the \\ndistribution of harmful deepfake images, or the long-term effect of disinformation on societal \\ntrust in public institutions.'), Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 6, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 'trapped': '', '_id': 'ed31388e1ee34066941bc3339ce0309e', '_collection_name': 'Implications of AI'}, page_content='Scientific Report on the Safety of Advanced AI, could be: 1) Technical / Model risks (or risk from malfunction): \\nConfabulation; Dangerous or Violent Recommendations; Data Privacy; Value Chain and Component Integration; \\nHarmful Bias, and Homogenization; 2) Misuse by humans (or malicious use): CBRN Information or Capabilities; \\nData Privacy; Human-AI Configuration; Obscene, Degrading, and/or Abusive Content; Information Integrity;'), Document(metadata={'source': 'data/NIST.AI.600-1.pdf', 'file_path': 'data/NIST.AI.600-1.pdf', 'page': 58, 'total_pages': 64, 'format': 'PDF 1.6', 'title': 'Artificial Intelligence Risk Management Framework: Generative Artificial Intelligence Profile', 'author': 'National Institute of Standards and Technology', 'subject': '', 'keywords': '', 'creator': 'Acrobat PDFMaker 24 for Word', 'producer': 'Adobe PDF Library 24.2.159', 'creationDate': \"D:20240805141702-04'00'\", 'modDate': \"D:20240805143048-04'00'\", 
'trapped': '', '_id': '771251b765e44f43b849dc70ed71ec1b', '_collection_name': 'Implications of AI'}, page_content='55 \\nDe Angelo, D. (2024) Short, Mid and Long-Term Impacts of AI in Cybersecurity. Palo Alto Networks. \\nhttps://www.paloaltonetworks.com/blog/2024/02/impacts-of-ai-in-cybersecurity/ \\nDe Freitas, J. et al. (2023) Chatbots and Mental Health: Insights into the Safety of Generative AI. Harvard \\nBusiness School. https://www.hbs.edu/ris/Publication%20Files/23-011_c1bdd417-f717-47b6-bccb-\\n5438c6e65c1a_f6fd9798-3c2d-4932-b222-056231fe69d7.pdf'), Document(metadata={'source': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'file_path': 'data/Blueprint-for-an-AI-Bill-of-Rights.pdf', 'page': 3, 'total_pages': 73, 'format': 'PDF 1.6', 'title': 'Blueprint for an AI Bill of Rights', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Adobe Illustrator 26.3 (Macintosh)', 'producer': 'iLovePDF', 'creationDate': \"D:20220920133035-04'00'\", 'modDate': \"D:20221003104118-04'00'\", 'trapped': '', '_id': '4f4a06bdfdb44c28ae0299187309082f', '_collection_name': 'Implications of AI'}, page_content='discussions include that AI has transformative potential to improve Americans’ lives, and that preventing the \\nharms of these technologies is both necessary and achievable. The Appendix includes a full list of public engage-\\nments. 
\\n4')]}\n" ] } ], "source": [ "from langchain_openai import ChatOpenAI\n", "from operator import itemgetter\n", "from langchain_core.runnables import RunnablePassthrough, RunnableParallel\n", "from langchain.schema import StrOutputParser\n", "\n", "llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) \n", "\n", "# Answer-only chain: retrieve context for the question, fill the prompt and\n", "# return the model output parsed to a plain string.\n", "rag_chain = (\n", "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n", "    | rag_prompt | llm | StrOutputParser()\n", ")\n", "\n", "# Evaluation chain: same retrieval step, but returns the raw model message under\n", "# 'response' together with the retrieved documents under 'context', so both can\n", "# be collected for scoring.\n", "rag_qa_chain = (\n", "    {\"context\": itemgetter(\"question\") | retriever, \"question\": itemgetter(\"question\")}\n", "    | {\"response\": rag_prompt | llm, \"context\": itemgetter(\"context\")}\n", ")\n", "\n", "result = rag_qa_chain.invoke({\"question\" : \"Is AI a threat to humanity?\"})\n", "print(result)" ] }, { "cell_type": "code", "execution_count": 43, "metadata": {}, "outputs": [], "source": [ "answers = []\n", "contexts = []\n", "\n", "for question in test_questions:\n", " response = rag_qa_chain.invoke({\"question\" : question})\n", " answers.append(response[\"response\"].content)\n", " contexts.append([context.page_content for context in response[\"context\"]])" ] }, { "cell_type": "code", "execution_count": 44, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'question': 'How do language models contribute to the reduction of content diversity in writing?',\n", " 'answer': 'Language models can contribute to the reduction of content diversity in writing by producing overly homogenized outputs, which can be incorrect or lead to unreliable decision-making and amplify harmful biases. This phenomenon can flow from foundation models to downstream models and systems, with the foundation models acting as “bottlenecks” or single points of failure. 
Overly homogenized content can also contribute to what is referred to as “model collapse.”',\n", " 'contexts': ['https://doi.org/10.1787/2448f04b-en \\nOECD (2024) \"Defining AI incidents and related terms\" OECD Artificial Intelligence Papers, No. 16, OECD \\nPublishing, Paris. https://doi.org/10.1787/d1a8d965-en \\nOpenAI (2023) GPT-4 System Card. https://cdn.openai.com/papers/gpt-4-system-card.pdf \\nOpenAI (2024) GPT-4 Technical Report. https://arxiv.org/pdf/2303.08774 \\nPadmakumar, V. et al. (2024) Does writing with language models reduce content diversity? ICLR. \\nhttps://arxiv.org/pdf/2309.05196',\n", " '9 \\nand reduced content diversity). Overly homogenized outputs can themselves be incorrect, or they may \\nlead to unreliable decision-making or amplify harmful biases. These phenomena can flow from \\nfoundation models to downstream models and systems, with the foundation models acting as \\n“bottlenecks,” or single points of failure. \\nOverly homogenized content can contribute to “model collapse.” Model collapse can occur when model',\n", " 'https://arxiv.org/pdf/2310.11986 \\nWeidinger, L. et al. (2022) Taxonomy of Risks posed by Language Models. FAccT ’22. \\nhttps://dl.acm.org/doi/pdf/10.1145/3531146.3533088 \\nWest, D. (2023) AI poses disproportionate risks to women. Brookings. \\nhttps://www.brookings.edu/articles/ai-poses-disproportionate-risks-to-women/ \\nWu, K. et al. (2024) How well do LLMs cite relevant medical references? An evaluation framework and \\nanalyses. arXiv. https://arxiv.org/pdf/2402.02008',\n", " 'Shumailov, I. et al. (2023) The curse of recursion: training on generated data makes models forget. arXiv. \\nhttps://arxiv.org/pdf/2305.17493v2 \\nSmith, A. et al. (2023) Hallucination or Confabulation? Neuroanatomy as metaphor in Large Language \\nModels. PLOS Digital Health. \\nhttps://journals.plos.org/digitalhealth/article?id=10.1371/journal.pdig.0000388 \\nSoice, E. et al. 
(2023) Can large language models democratize access to dual-use biotechnology? arXiv. \\nhttps://arxiv.org/abs/2306.03809'],\n", " 'ground_truth': 'The answer to given question is not present in context'}" ] }, "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from datasets import Dataset\n", "\n", "response_dataset = Dataset.from_dict({\n", " \"question\" : test_questions,\n", " \"answer\" : answers,\n", " \"contexts\" : contexts,\n", " \"ground_truth\" : test_groundtruths\n", "})\n", "response_dataset[0]" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# from langchain.text_splitter import RecursiveCharacterTextSplitter\n", "\n", "# text_splitter = RecursiveCharacterTextSplitter(\n", "# chunk_size = 500,\n", "# chunk_overlap = 50\n", "# )\n", "\n", "# rag_documents = text_splitter.split_documents(rag_documents)" ] }, { "cell_type": "code", "execution_count": 45, "metadata": {}, "outputs": [], "source": [ "from ragas import evaluate\n", "from ragas.metrics import (\n", " faithfulness,\n", " answer_relevancy,\n", " answer_correctness,\n", " context_recall,\n", " context_precision,\n", ")\n", "\n", "metrics = [\n", " faithfulness,\n", " answer_relevancy,\n", " context_recall,\n", " context_precision,\n", " answer_correctness,\n", "]" ] }, { "cell_type": "code", "execution_count": 46, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1c87e1fd34114cc9802a54a9ab8e1ca1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Evaluating: 0%| | 0/100 [00:00\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
questioncontextsanswerground_truthfaithfulnessanswer_relevancycontext_recallcontext_precisionanswer_correctness
0How do language models contribute to the reduc...[https://doi.org/10.1787/2448f04b-en \\nOECD (2...Language models can contribute to the reductio...The answer to given question is not present in...1.0000000.9672191.000.0000000.178897
1What should be provided in terms of notice and...[alternative, where appropriate \\nBrief, clear...There should be a brief, clear notice that ind...Those impacted by an automated system should b...1.0000000.9580771.001.0000000.952916
2How can designers, developers, and deployers o...[systems have the capacity to drive extraordin...Designers, developers, and deployers of automa...Designers, developers, and deployers of automa...1.0000000.9454991.001.0000000.770302
3What benefits have been publicly described by ...[15. See, e.g., Charles Pruitt. People Doing W...I don't know.The benefits of 'traffic calming' measures hav...0.0000000.0000001.001.0000000.181544
4What is the purpose of AI Red-teaming in testi...[sense of AI-generated information, and subseq...The purpose of AI Red-teaming in testing AI sy...AI Red-teaming is a structured testing exercis...1.0000001.0000001.000.9166670.817249
5What is the importance of training and assessm...[Training and assessment. Anyone administering...Training and assessment are important in ensur...Training and assessment are crucial in ensurin...0.7500000.9801110.751.0000000.862464
6How do advertisement delivery systems reinforc...[ering ads in ways that reinforce racial and g...Advertisement delivery systems reinforce racia...Advertisement delivery systems reinforce racia...1.0000001.0000001.001.0000000.846395
7What is the purpose of the Blueprint for an AI...[Examples of automated systems for which the B...The purpose of the Blueprint for an AI Bill of...The Blueprint for an AI Bill of Rights is inte...1.0000000.9745531.001.0000000.993355
8What are the key privacy protections provided ...[records systems, including limits on data ret...The key privacy protections provided by the Pr...The Privacy Act of 1974 provides privacy prote...1.0000001.0000001.001.0000000.845967
9How does the Fair Credit Reporting Act ensure ...[beyond simple notice to include reporting ele...The Fair Credit Reporting Act ensures that con...The Fair Credit Reporting Act ensures that con...1.0000000.9158131.001.0000000.620742
10Why is ongoing monitoring and review important...[16 \\nGOVERN 1.5: Ongoing monitoring and perio...Ongoing monitoring and periodic review are imp...Ongoing monitoring and periodic review of the ...0.6875000.9532970.601.0000000.614902
11What criteria should automated systems in sens...[HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFA...Automated systems in sensitive domains should ...Automated systems used within sensitive domain...1.0000000.9291810.001.0000000.979904
12How can stakeholders prevent algorithmic discr...[systems have the capacity to drive extraordin...Stakeholders can prevent algorithmic discrimin...Stakeholders can prevent algorithmic discrimin...0.8571431.0000001.001.0000000.831405
13How to evaluate GAI system outputs against ris...[GAI resources; Apply organizational risk tole...To evaluate GAI system outputs against risk to...The answer to given question is not present in...0.8888890.9677551.000.0000000.176636
14How can data tracking and content detection he...[assessments, and alerting, dynamic risk asses...I don't know.Data tracking and content detection can help r...0.0000000.0000000.501.0000000.182611
15What's needed for an algorithmic impact assess...[consultation, design stage equity assessments...An algorithmic impact assessment for automated...The answer to given question is not present in...1.0000000.9857431.000.0000000.173255
16What details must California warehouse employe...[tion about quotas, potentially facilitated by...California warehouse employers are required to...Warehousing employers in California that use q...0.2000000.9457620.500.8055560.541300
17What measures are needed for handling GAI syst...[17 \\nGOVERN 1.7: Processes and procedures are...For handling GAI system incidents and decommis...Establish and maintain procedures for escalati...1.0000001.0000000.501.0000000.755853
18How can privacy be ensured in automated system...[Protect privacy by design and by default \\nPr...Privacy can be ensured in automated system des...Privacy in automated system design can be ensu...1.0000001.0000000.401.0000000.840599
19How can algorithmic discrimination be prevente...[systems have the capacity to drive extraordin...Algorithmic discrimination can be prevented in...Algorithmic discrimination in surveillance sys...1.0000001.0000001.001.0000000.700298
\n", "" ], "text/plain": [ " question \\\n", "0 How do language models contribute to the reduc... \n", "1 What should be provided in terms of notice and... \n", "2 How can designers, developers, and deployers o... \n", "3 What benefits have been publicly described by ... \n", "4 What is the purpose of AI Red-teaming in testi... \n", "5 What is the importance of training and assessm... \n", "6 How do advertisement delivery systems reinforc... \n", "7 What is the purpose of the Blueprint for an AI... \n", "8 What are the key privacy protections provided ... \n", "9 How does the Fair Credit Reporting Act ensure ... \n", "10 Why is ongoing monitoring and review important... \n", "11 What criteria should automated systems in sens... \n", "12 How can stakeholders prevent algorithmic discr... \n", "13 How to evaluate GAI system outputs against ris... \n", "14 How can data tracking and content detection he... \n", "15 What's needed for an algorithmic impact assess... \n", "16 What details must California warehouse employe... \n", "17 What measures are needed for handling GAI syst... \n", "18 How can privacy be ensured in automated system... \n", "19 How can algorithmic discrimination be prevente... \n", "\n", " contexts \\\n", "0 [https://doi.org/10.1787/2448f04b-en \\nOECD (2... \n", "1 [alternative, where appropriate \\nBrief, clear... \n", "2 [systems have the capacity to drive extraordin... \n", "3 [15. See, e.g., Charles Pruitt. People Doing W... \n", "4 [sense of AI-generated information, and subseq... \n", "5 [Training and assessment. Anyone administering... \n", "6 [ering ads in ways that reinforce racial and g... \n", "7 [Examples of automated systems for which the B... \n", "8 [records systems, including limits on data ret... \n", "9 [beyond simple notice to include reporting ele... \n", "10 [16 \\nGOVERN 1.5: Ongoing monitoring and perio... \n", "11 [HUMAN ALTERNATIVES, \\nCONSIDERATION, AND \\nFA... \n", "12 [systems have the capacity to drive extraordin... 
\n", "13 [GAI resources; Apply organizational risk tole... \n", "14 [assessments, and alerting, dynamic risk asses... \n", "15 [consultation, design stage equity assessments... \n", "16 [tion about quotas, potentially facilitated by... \n", "17 [17 \\nGOVERN 1.7: Processes and procedures are... \n", "18 [Protect privacy by design and by default \\nPr... \n", "19 [systems have the capacity to drive extraordin... \n", "\n", " answer \\\n", "0 Language models can contribute to the reductio... \n", "1 There should be a brief, clear notice that ind... \n", "2 Designers, developers, and deployers of automa... \n", "3 I don't know. \n", "4 The purpose of AI Red-teaming in testing AI sy... \n", "5 Training and assessment are important in ensur... \n", "6 Advertisement delivery systems reinforce racia... \n", "7 The purpose of the Blueprint for an AI Bill of... \n", "8 The key privacy protections provided by the Pr... \n", "9 The Fair Credit Reporting Act ensures that con... \n", "10 Ongoing monitoring and periodic review are imp... \n", "11 Automated systems in sensitive domains should ... \n", "12 Stakeholders can prevent algorithmic discrimin... \n", "13 To evaluate GAI system outputs against risk to... \n", "14 I don't know. \n", "15 An algorithmic impact assessment for automated... \n", "16 California warehouse employers are required to... \n", "17 For handling GAI system incidents and decommis... \n", "18 Privacy can be ensured in automated system des... \n", "19 Algorithmic discrimination can be prevented in... \n", "\n", " ground_truth faithfulness \\\n", "0 The answer to given question is not present in... 1.000000 \n", "1 Those impacted by an automated system should b... 1.000000 \n", "2 Designers, developers, and deployers of automa... 1.000000 \n", "3 The benefits of 'traffic calming' measures hav... 0.000000 \n", "4 AI Red-teaming is a structured testing exercis... 1.000000 \n", "5 Training and assessment are crucial in ensurin... 
0.750000 \n", "6 Advertisement delivery systems reinforce racia... 1.000000 \n", "7 The Blueprint for an AI Bill of Rights is inte... 1.000000 \n", "8 The Privacy Act of 1974 provides privacy prote... 1.000000 \n", "9 The Fair Credit Reporting Act ensures that con... 1.000000 \n", "10 Ongoing monitoring and periodic review of the ... 0.687500 \n", "11 Automated systems used within sensitive domain... 1.000000 \n", "12 Stakeholders can prevent algorithmic discrimin... 0.857143 \n", "13 The answer to given question is not present in... 0.888889 \n", "14 Data tracking and content detection can help r... 0.000000 \n", "15 The answer to given question is not present in... 1.000000 \n", "16 Warehousing employers in California that use q... 0.200000 \n", "17 Establish and maintain procedures for escalati... 1.000000 \n", "18 Privacy in automated system design can be ensu... 1.000000 \n", "19 Algorithmic discrimination in surveillance sys... 1.000000 \n", "\n", " answer_relevancy context_recall context_precision answer_correctness \n", "0 0.967219 1.00 0.000000 0.178897 \n", "1 0.958077 1.00 1.000000 0.952916 \n", "2 0.945499 1.00 1.000000 0.770302 \n", "3 0.000000 1.00 1.000000 0.181544 \n", "4 1.000000 1.00 0.916667 0.817249 \n", "5 0.980111 0.75 1.000000 0.862464 \n", "6 1.000000 1.00 1.000000 0.846395 \n", "7 0.974553 1.00 1.000000 0.993355 \n", "8 1.000000 1.00 1.000000 0.845967 \n", "9 0.915813 1.00 1.000000 0.620742 \n", "10 0.953297 0.60 1.000000 0.614902 \n", "11 0.929181 0.00 1.000000 0.979904 \n", "12 1.000000 1.00 1.000000 0.831405 \n", "13 0.967755 1.00 0.000000 0.176636 \n", "14 0.000000 0.50 1.000000 0.182611 \n", "15 0.985743 1.00 0.000000 0.173255 \n", "16 0.945762 0.50 0.805556 0.541300 \n", "17 1.000000 0.50 1.000000 0.755853 \n", "18 1.000000 0.40 1.000000 0.840599 \n", "19 1.000000 1.00 1.000000 0.700298 " ] }, "execution_count": 47, "metadata": {}, "output_type": "execute_result" } ], "source": [ "results_df = results.to_pandas()\n", "results_df" ] }, 
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# eval_llm = ChatOpenAI(model=\"gpt-4o-mini\", tags=[\"base_llm\"]) " ] } ], "metadata": { "kernelspec": { "display_name": "llmops-course", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.9" } }, "nbformat": 4, "nbformat_minor": 2 }