{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import sqlite3, json\n", "from contextlib import closing\n", "\n", "def load_questions(sqlite_filename):\n", " all_questions = []\n", " with closing(sqlite3.connect(sqlite_filename)) as db:\n", " db.row_factory = sqlite3.Row\n", " with closing(db.cursor()) as cursor:\n", " results = cursor.execute(\n", " \"SELECT id, articleId, title, category, section, questions FROM articles WHERE articleType = ? AND doNotUse IS NULL OR doNotUse = 0\",\n", " ('article',)\n", " ).fetchall()\n", " \n", " for res in results:\n", " \n", " questions = json.loads(res['questions'])\n", " # questions_copy = questions.copy()\n", " \n", " for q in questions:\n", " q['query'] = \" \".join(res['section'].split() + res['title'].split() + q['question'].split()).lower()\n", " q['articleId'] = res['articleId']\n", " \n", " # for q in questions_copy:\n", " # q['query'] = q['question']\n", " # q['articleId'] = res['articleId']\n", "\n", " all_questions += questions\n", " # all_questions += questions_copy\n", " return all_questions\n", "\n", "questions = load_questions(\"omnidesk-ai-chatgpt-questions.sqlite\")" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from langchain.vectorstores import FAISS\n", "from langchain.docstore.document import Document\n", "from langchain.embeddings import SentenceTransformerEmbeddings" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "docs = [\n", " Document(page_content=q['query'], metadata={ 'answer': q['answer'], 'articleId': q['articleId'] })\n", " for q in questions\n", "]" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "2023-08-07 17:36:37.358149: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" ] } ], "source": [ "from extract_keywords import canonical_keywords, merge_keywords, tokenize_sentence, extract_keywords, init_keyword_extractor\n", "init_keyword_extractor()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/makcrx/anaconda3/lib/python3.10/site-packages/sklearn/feature_extraction/text.py:528: UserWarning: The parameter 'token_pattern' will not be used since 'tokenizer' is not None'\n", " warnings.warn(\n" ] }, { "data": { "text/plain": [ "['почта россия трекинг']" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "extract_keywords('пр трекинг')" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "ename": "NameError", "evalue": "name 'SentenceTransformerEmbeddings' is not defined", "output_type": "error", "traceback": [ "NameError: name 'SentenceTransformerEmbeddings' is not defined" ] } ], "source": [