Chris Finlayson committed on
Commit
ee9fa1c
1 Parent(s): b5dd388

initial commit

Browse files
Files changed (5)
  1. .ipynb_checkpoints/NLP-checkpoint.ipynb +236 -0
  2. NLP.ipynb +0 -0
  3. app.py +172 -0
  4. graph.png +0 -0
  5. requirements.txt +4 -0
.ipynb_checkpoints/NLP-checkpoint.ipynb ADDED
@@ -0,0 +1,236 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c49abf54-35c7-4b82-aa31-a155633c3327",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "43644952-bca3-4060-af76-3d5a8357be06",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import re\n",
+     "import pandas as pd\n",
+     "import bs4\n",
+     "import requests\n",
+     "import spacy\n",
+     "from spacy import displacy\n",
+     "nlp = spacy.load('en_core_web_sm')\n",
+     "\n",
+     "from spacy.matcher import Matcher\n",
+     "from spacy.tokens import Span\n",
+     "\n",
+     "import networkx as nx\n",
+     "\n",
+     "import matplotlib.pyplot as plt\n",
+     "from tqdm import tqdm\n",
+     "\n",
+     "pd.set_option('display.max_colwidth', 200)\n",
+     "%matplotlib inline"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "1b73f085-2b8b-4f48-b26c-2da5fb22c9f2",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# import wikipedia sentences\n",
+     "candidate_sentences = pd.read_csv(\"../input/wiki-sentences1/wiki_sentences_v2.csv\")\n",
+     "candidate_sentences.shape"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "1bd9de52-e1bc-46a6-9f52-e90969ed9f0c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_entities(sent):\n",
+     "    ## chunk 1\n",
+     "    ent1 = \"\"\n",
+     "    ent2 = \"\"\n",
+     "\n",
+     "    prv_tok_dep = \"\"   # dependency tag of previous token in the sentence\n",
+     "    prv_tok_text = \"\"  # previous token in the sentence\n",
+     "\n",
+     "    prefix = \"\"\n",
+     "    modifier = \"\"\n",
+     "\n",
+     "    #############################################################\n",
+     "\n",
+     "    for tok in nlp(sent):\n",
+     "        ## chunk 2\n",
+     "        # if token is a punctuation mark then move on to the next token\n",
+     "        if tok.dep_ != \"punct\":\n",
+     "            # check: token is a compound word or not\n",
+     "            if tok.dep_ == \"compound\":\n",
+     "                prefix = tok.text\n",
+     "                # if the previous word was also a 'compound' then add the current word to it\n",
+     "                if prv_tok_dep == \"compound\":\n",
+     "                    prefix = prv_tok_text + \" \" + tok.text\n",
+     "\n",
+     "            # check: token is a modifier or not\n",
+     "            if tok.dep_.endswith(\"mod\"):\n",
+     "                modifier = tok.text\n",
+     "                # if the previous word was also a 'compound' then add the current word to it\n",
+     "                if prv_tok_dep == \"compound\":\n",
+     "                    modifier = prv_tok_text + \" \" + tok.text\n",
+     "\n",
+     "            ## chunk 3\n",
+     "            # str.find returns an index, so compare with -1 rather than True\n",
+     "            if tok.dep_.find(\"subj\") != -1:\n",
+     "                ent1 = modifier + \" \" + prefix + \" \" + tok.text\n",
+     "                prefix = \"\"\n",
+     "                modifier = \"\"\n",
+     "                prv_tok_dep = \"\"\n",
+     "                prv_tok_text = \"\"\n",
+     "\n",
+     "            ## chunk 4\n",
+     "            if tok.dep_.find(\"obj\") != -1:\n",
+     "                ent2 = modifier + \" \" + prefix + \" \" + tok.text\n",
+     "\n",
+     "            ## chunk 5\n",
+     "            # update variables\n",
+     "            prv_tok_dep = tok.dep_\n",
+     "            prv_tok_text = tok.text\n",
+     "    #############################################################\n",
+     "\n",
+     "    return [ent1.strip(), ent2.strip()]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "11bec388-fdb8-4823-9049-aa4cf328eba6",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "entity_pairs = []\n",
+     "\n",
+     "for i in tqdm(candidate_sentences[\"sentence\"]):\n",
+     "    entity_pairs.append(get_entities(i))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "02f56072-ae65-4b15-a3b6-674701040568",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "def get_relation(sent):\n",
+     "\n",
+     "    doc = nlp(sent)\n",
+     "\n",
+     "    # Matcher class object\n",
+     "    matcher = Matcher(nlp.vocab)\n",
+     "\n",
+     "    # define the pattern\n",
+     "    pattern = [{'DEP': 'ROOT'},\n",
+     "               {'DEP': 'prep', 'OP': \"?\"},\n",
+     "               {'DEP': 'agent', 'OP': \"?\"},\n",
+     "               {'POS': 'ADJ', 'OP': \"?\"}]\n",
+     "\n",
+     "    # spaCy 3.x API: patterns are passed as a list\n",
+     "    matcher.add(\"matching_1\", [pattern])\n",
+     "\n",
+     "    matches = matcher(doc)\n",
+     "    k = len(matches) - 1\n",
+     "\n",
+     "    span = doc[matches[k][1]:matches[k][2]]\n",
+     "\n",
+     "    return span.text"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "ee3a774f-9f2d-4a4c-a77a-04bc420d4864",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c04581bb-46b5-48ce-bbe1-b465a789ad82",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# extract subject\n",
+     "source = [i[0] for i in entity_pairs]\n",
+     "\n",
+     "# extract object\n",
+     "target = [i[1] for i in entity_pairs]\n",
+     "\n",
+     "kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "b0fec1f2-d370-4d79-8a92-2ebdff2be420",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# create a directed graph from a dataframe\n",
+     "G = nx.from_pandas_edgelist(kg_df, \"source\", \"target\",\n",
+     "                            edge_attr=True, create_using=nx.MultiDiGraph())"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "39b80dbe-f991-4e12-b0a1-4026344af82f",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "plt.figure(figsize=(12, 12))\n",
+     "\n",
+     "pos = nx.spring_layout(G)\n",
+     "nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)\n",
+     "plt.show()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "be07f563-0b61-441f-bb24-a9e884eef1b8",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# https://www.kaggle.com/code/pavansanagapati/knowledge-graph-nlp-tutorial-bert-spacy-nltk"
+    ]
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3 (ipykernel)",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.11.5"
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
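
The notebook's two helpers do the heavy lifting: get_entities walks the dependency tags (compound, *mod, *subj, *obj) to assemble one subject/object pair per sentence, and get_relation matches the ROOT verb plus an optional preposition, agent, or adjective. A minimal sketch of the same mechanics on a single sentence, assuming en_core_web_sm is installed (the exact tags and the matched span depend on the model version):

import spacy
from spacy.matcher import Matcher

nlp = spacy.load('en_core_web_sm')
doc = nlp("the film was directed by john lasseter")

# inspect the dependency tags that get_entities keys off
print([(tok.text, tok.dep_) for tok in doc])

# the same ROOT + optional prep/agent/ADJ pattern used by get_relation
matcher = Matcher(nlp.vocab)
pattern = [{'DEP': 'ROOT'}, {'DEP': 'prep', 'OP': '?'},
           {'DEP': 'agent', 'OP': '?'}, {'POS': 'ADJ', 'OP': '?'}]
matcher.add("rel", [pattern])

matches = matcher(doc)          # list of (match_id, start, end) tuples
_, start, end = matches[-1]     # the notebook keeps the last (longest) match
print(doc[start:end].text)      # expected: something like "directed by"
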
NLP.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
app.py ADDED
@@ -0,0 +1,172 @@
+ import gradio as gr
+ import os
+ import fitz
+ import re
+ import pandas as pd
+ import bs4
+ import requests
+ import spacy
+ import spacy.cli
+ from spacy import displacy
+ from spacy.matcher import Matcher
+ from spacy.tokens import Span
+ import networkx as nx
+ import matplotlib.pyplot as plt
+ from tqdm import tqdm
+
+ # load the model inside try/except so a missing model is downloaded
+ # instead of crashing at import time
+ try:
+     nlp = spacy.load('en_core_web_sm')
+ except OSError:
+     print("Model not found. Downloading...")
+     spacy.cli.download("en_core_web_sm")
+     nlp = spacy.load('en_core_web_sm')
+
+
+ # def read_pdf(file):
+ #     doc = fitz.open(file)
+ #     text = []
+ #     for page in doc:
+ #         text.extend(page.get_text("text").split('\n'))
+ #     return text
+
+ def read_csv(file):
+     candidate_sentences = pd.read_csv(file)
+     return candidate_sentences.shape
+
+ def get_entities(sent):
+     ## chunk 1
+     ent1 = ""
+     ent2 = ""
+
+     prv_tok_dep = ""   # dependency tag of previous token in the sentence
+     prv_tok_text = ""  # previous token in the sentence
+
+     prefix = ""
+     modifier = ""
+
+     #############################################################
+
+     for tok in nlp(sent):
+         ## chunk 2
+         # if token is a punctuation mark then move on to the next token
+         if tok.dep_ != "punct":
+             # check: token is a compound word or not
+             if tok.dep_ == "compound":
+                 prefix = tok.text
+                 # if the previous word was also a 'compound' then add the current word to it
+                 if prv_tok_dep == "compound":
+                     prefix = prv_tok_text + " " + tok.text
+
+             # check: token is a modifier or not
+             if tok.dep_.endswith("mod"):
+                 modifier = tok.text
+                 # if the previous word was also a 'compound' then add the current word to it
+                 if prv_tok_dep == "compound":
+                     modifier = prv_tok_text + " " + tok.text
+
+             ## chunk 3
+             # str.find returns an index, so compare with -1 rather than True
+             if tok.dep_.find("subj") != -1:
+                 ent1 = modifier + " " + prefix + " " + tok.text
+                 prefix = ""
+                 modifier = ""
+                 prv_tok_dep = ""
+                 prv_tok_text = ""
+
+             ## chunk 4
+             if tok.dep_.find("obj") != -1:
+                 ent2 = modifier + " " + prefix + " " + tok.text
+
+             ## chunk 5
+             # update variables
+             prv_tok_dep = tok.dep_
+             prv_tok_text = tok.text
+     #############################################################
+
+     return [ent1.strip(), ent2.strip()]
+
+ def get_relation(sent):
+
+     doc = nlp(sent)
+
+     # Matcher class object
+     matcher = Matcher(nlp.vocab)
+
+     # define the pattern
+     pattern = [{'DEP': 'ROOT'},
+                {'DEP': 'prep', 'OP': "?"},
+                {'DEP': 'agent', 'OP': "?"},
+                {'POS': 'ADJ', 'OP': "?"}]
+
+     matcher.add("matching_1", [pattern])
+
+     matches = matcher(doc)
+     k = len(matches) - 1
+
+     span = doc[matches[k][1]:matches[k][2]]
+
+     return span.text
+
+ def ulify(elements):
+     string = "<ul>\n"
+     string += "\n".join(["<li>" + str(s) + "</li>" for s in elements])
+     string += "\n</ul>"
+     return string
+
+ def execute_process(file, edge):
+     # candidate_sentences = pd.DataFrame(read_pdf(file), columns=['Sentences'])
+     candidate_sentences = pd.read_csv(file)
+
+     entity_pairs = []
+     for i in tqdm(candidate_sentences["sentence"]):
+         entity_pairs.append(get_entities(i))
+     relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]
+     # extract subject
+     source = [i[0] for i in entity_pairs]
+
+     # extract object
+     target = [i[1] for i in entity_pairs]
+     kg_df = pd.DataFrame({'source': source, 'target': target, 'edge': relations})
+
+     # create a variable of all unique edges
+     unique_edges = kg_df['edge'].unique() if kg_df['edge'].nunique() != 0 else None
+     # create a dataframe of all unique edges and their counts
+     edge_counts = kg_df['edge'].value_counts()
+     unique_edges_df = pd.DataFrame({'edge': edge_counts.index, 'count': edge_counts.values})
+
+     G = nx.from_pandas_edgelist(kg_df, "source", "target",
+                                 edge_attr=True, create_using=nx.MultiDiGraph())
+
+     # an empty textbox arrives as "", so test truthiness rather than None
+     if edge:
+         G = nx.from_pandas_edgelist(kg_df[kg_df['edge'] == edge], "source", "target",
+                                     edge_attr=True, create_using=nx.MultiDiGraph())
+         plt.figure(figsize=(12, 12))
+         pos = nx.spring_layout(G)
+         nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos=pos)
+         plt.savefig("graph.png")
+         # return "graph.png", "\n".join(unique_edges)
+         return "graph.png", unique_edges_df
+
+     else:
+         plt.figure(figsize=(12, 12))
+         pos = nx.spring_layout(G, k=0.5)  # k regulates the distance between nodes
+         nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos=pos)
+         plt.savefig("graph.png")
+         # return "graph.png", "\n".join(unique_edges)
+         return "graph.png", unique_edges_df
+
+ inputs = [
+     gr.File(label="Upload PDF"),
+     gr.Textbox(label="Graph a particular edge", type="text")
+ ]
+
+ outputs = [
+     gr.Image(label="Generated graph"),
+     gr.Dataframe(label="Unique edges", type="pandas")
+ ]
+
+ description = 'This app reads all text from a PDF document and allows the user to generate a knowledge graph illustrating the concepts and relationships within it'
+ iface = gr.Interface(fn=execute_process, inputs=inputs, outputs=outputs, title="PDF Knowledge graph", description=description)
+ iface.launch()
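
One wrinkle for local testing: iface.launch() runs at module top level, so python app.py starts the server immediately, but so does importing app from another script. A common guard, offered here as a suggestion rather than part of the commit:

# hypothetical guard, not in the committed app.py: only launch the
# Gradio server when the file is run directly, not when imported
if __name__ == "__main__":
    iface.launch()
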
graph.png ADDED
requirements.txt ADDED
@@ -0,0 +1,4 @@
+ gradio
+ PyMuPDF
+ transformers
+ plotly
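
Note that requirements.txt does not cover everything app.py imports: spacy, pandas, networkx, matplotlib, tqdm, bs4, and requests are all used but unlisted, while transformers and plotly are listed but never imported. Assuming the imports reflect intent, a fuller list might look like this (package names are the standard PyPI ones; beautifulsoup4 provides bs4):

gradio
PyMuPDF
spacy
pandas
networkx
matplotlib
tqdm
beautifulsoup4
requests
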