PeteBleackley committed on
Commit
432a965
·
1 Parent(s): 679a7b2

Training on Colab. Half a crawler

Files changed (5)
  1. Crawler.py +79 -0
  2. README.md +1 -1
  3. Statement.py +20 -0
  4. requirements.txt +1 -1
  5. scripts.py +5 -4
Crawler.py ADDED
@@ -0,0 +1,79 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Thu Nov 9 14:41:00 2023
+
+ @author: peter
+ """
+
+ import urllib.parse
+ import urllib.robotparser
+ import heapdict
+ import requests
+ import bs4
+ import transformers
+ import tokenizers
+ import spacy
+ import torch
+ from allennlp.predictors.predictor import Predictor
+ import Statement
+ from vectordb import HNSWVectorDB
+
+ class Crawler(object):
+
+     def __init__(self,start):
+         self.frontier = heapdict.heapdict()  # min-priority queue of URLs to visit
+         self.frontier[start] = -1
+         self.policies = {}  # cached robots.txt parsers, keyed by domain
+         self.tokenizer = tokenizers.Tokenizer.from_pretrained('roberta-base')
+         self.pad_token = self.tokenizer.token_to_id('<pad>')
+         self.encoder = transformers.AutoModel.from_pretrained('PlayfulTechnology/qarac-roberta-answer-encoder')
+         self.db = HNSWVectorDB[Statement.Statement](space='cosine')
+         model_url = "https://storage.googleapis.com/allennlp-public-models/coref-spanbert-large-2020.02.27.tar.gz"
+         self.predictor = Predictor.from_path(model_url)
+         self.nlp = spacy.load('en_core_web_trf')
+
+     def candidates(self):
+         while len(self.frontier) > 0:
+             (candidate,score) = self.frontier.popitem()  # lowest score first
+             if score < 0:
+                 yield candidate
+
+     def __call__(self):
+         visited = set()
+         for candidate in self.candidates():
+             visited.add(candidate)
+             components = urllib.parse.urlparse(candidate)
+             domain = '{0}://{1}'.format(components.scheme,components.netloc)
+             if domain not in self.policies:
+                 self.policies[domain] = urllib.robotparser.RobotFileParser(domain+'/robots.txt')
+                 self.policies[domain].read()
+             if self.policies[domain].can_fetch('*',candidate):
+
+                 response = requests.get(candidate)
+                 if response.status_code == 200 and response.headers['content-type'].startswith('text/html'):
+                     soup = bs4.BeautifulSoup(response.text,'html.parser')
+                     if soup.html.attrs.get('lang') == 'en':
+                         text = soup.get_text()
+                         resolved = self.predictor.coref_resolved(text)
+                         sentences = [self.tokenizer.encode(sentence.text)
+                                      for sentence in self.nlp(resolved).sents]
+                         maxlen = max((len(sentence) for sentence in sentences))
+                         for sentence in sentences:
+                             sentence.pad(maxlen,pad_id=self.pad_token)
+                         tokens = torch.tensor([sentence.ids
+                                                for sentence in sentences],
+                                               device='cuda')
+                         vectors = self.encoder(tokens).numpy()
+                         N = vectors.shape[0]
+                         reliability = 0.0
+                         statements = [Statement.Statement(url=candidate,
+                                                           title=soup.title.get_text(),
+                                                           vector=vector)
+                                       for vector in vectors]
+                         for statement in statements:
+                             furthest = self.db.search
+
+
+
+
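A note on the frontier logic in Crawler.py: heapdict is a min-priority queue, so popitem() always returns the entry with the lowest score. Seeding the start URL at -1 and yielding only negative scores therefore visits the most promising pages first and discards anything whose score has drifted to zero or above. A minimal sketch of that behaviour (the URLs and scores are illustrative):

import heapdict

frontier = heapdict.heapdict()
frontier['https://example.com/a'] = -3   # most promising: popped first
frontier['https://example.com/b'] = -1
frontier['https://example.com/c'] = 2    # non-negative: discarded, never yielded

while len(frontier) > 0:
    url, score = frontier.popitem()      # lowest score first
    if score < 0:
        print(url)                       # prints a, then b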
README.md CHANGED
@@ -13,4 +13,4 @@ This is a research project to investigate ways of making NLP models more factual
 
 A description of the project can be found at [QARAC: Question Answering, Reasoning and Consistency](https://playfultechnology.co.uk/qarac-question-answering-reasoning-and-consistency.html) and updates can be found at the [project diary](https://playfultechnology.co.uk/tag/qarac.html).
 
- Models will be available on [HuggingFace](https://huggingface.co/PlayfulTechnology) and will be based on [Hyena models](https://arxiv.org/abs/2302.10866).
+ Models will be available on [HuggingFace](https://huggingface.co/PlayfulTechnology) and will be based on [RoBERTa](https://huggingface.co/docs/transformers/model_doc/roberta).
Statement.py ADDED
@@ -0,0 +1,20 @@
+ #!/usr/bin/env python3
+ # -*- coding: utf-8 -*-
+ """
+ Created on Fri Nov 17 15:43:07 2023
+
+ @author: peter
+ """
+
+ from docarray import BaseDoc
+ from docarray.typing import NDArray
+
+ class Statement(BaseDoc):
+     url: str = ''
+     title: str = ''
+     vector: NDArray[768]
+
+     def __neg__(self):  # unary minus flips the vector, keeping provenance
+         return Statement(url=self.url,
+                          title=self.title,
+                          vector=-self.vector)
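Since the vector database in Crawler.py is built with cosine similarity, negating a Statement's vector produces a maximally dissimilar document, which is presumably why Statement overloads unary minus. A small usage sketch (the values are illustrative):

import numpy as np

s = Statement(url='https://example.com',
              title='Example',
              vector=np.ones(768, dtype=np.float32))
negated = -s                     # calls Statement.__neg__
assert (negated.vector == -np.ones(768, dtype=np.float32)).all()
assert negated.url == s.url      # provenance is preserved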
requirements.txt CHANGED
@@ -13,4 +13,4 @@ scipy
 seaborn
 huggingface_hub
 gradio
-
+google-colab
scripts.py CHANGED
@@ -20,6 +20,7 @@ import seaborn
 import tqdm
 import gradio
 import boto3
+import google.colab
 
 class SequenceCrossEntropyLoss(torch.nn.Module):
     def __init__(self):
@@ -60,8 +61,8 @@ def download_training_data():
     if not os.path.exists('corpora'):
         os.makedirs('corpora')
     s3 = boto3.client('s3',
-                      aws_access_key_id=os.environ['AWS_KEY'],
-                      aws_secret_access_key=os.evviron['AWS_SECRET'])
+                      aws_access_key_id=google.colab.userdata.get('AWS_KEY'),
+                      aws_secret_access_key=google.colab.userdata.get('AWS_SECRET'))
     for obj in s3.list_objects(Bucket='qarac')['Contents']:
         filename = obj['Key']
         s3.download_file('qarac',filename,'corpora/{}'.format(filename))
@@ -132,7 +133,7 @@ def train_models(path,progress=gradio.Progress(track_tqdm=True)):
                                          consistency='corpora/consistency.csv')
     n_batches = len(training_data)
     history = {}
-    for epoch in range(10):
+    for epoch in range(25):
         print("Epoch",epoch)
         epoch_label = 'Epoch {}'.format(epoch)
         epoch_data = {}
@@ -154,7 +155,7 @@ def train_models(path,progress=gradio.Progress(track_tqdm=True)):
         epoch_data[batch] = loss.item()
         history[epoch_label] = epoch_data
         scheduler.step()
-    huggingface_hub.login(token=os.environ['HUGGINGFACE_TOKEN'])
+    huggingface_hub.login(token=google.colab.userdata.get('HUGGINGFACE_TOKEN'))
     trainer.question_encoder.push_to_hub('{}/qarac-roberta-question-encoder'.format(path))
     trainer.answer_encoder.push_to_hub('{}/qarac-roberta-answer-encoder'.format(path))
     trainer.decoder.push_to_hub('{}/qarac-roberta-decoder'.format(path))
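The scripts.py changes swap os.environ for google.colab.userdata.get(), which reads from Colab's Secrets panel and keeps credentials out of the notebook itself. One consequence is that the module now imports only on Colab; a small fallback helper is one way to keep it runnable elsewhere (a sketch; get_secret is a hypothetical name, not part of this commit):

import os

def get_secret(key):
    # Read a secret from Colab's Secrets panel, else the environment.
    try:
        from google.colab import userdata   # only available on Colab
        return userdata.get(key)
    except ImportError:
        return os.environ[key]

# e.g. aws_access_key_id=get_secret('AWS_KEY')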