Add training stub
Browse files- .dockerignore +5 -0
- .gitignore +1 -0
- Dockerfile +1 -1
- Makefile +1 -1
- clone_sentdex_model_tokenizer.py +0 -0
- config.json +1 -1
- prophetikai/gpt-code/config.json +0 -37
- saved_model/sentdex/keras_metadata.pb +1 -1
- saved_model/sentdex/saved_model.pb +2 -2
- saved_model/sentdex/variables/variables.data-00000-of-00001 +2 -2
- saved_model/sentdex/variables/variables.index +2 -2
- tf_model.h5 +1 -1
- train.py +14 -0
.dockerignore
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
saved_model
|
2 |
+
data
|
3 |
+
*.bin
|
4 |
+
*.h5
|
5 |
+
.git
|
.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
data
|
Dockerfile
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
FROM tensorflow/tensorflow:2.6.0rc2-gpu-jupyter
|
2 |
|
3 |
-
RUN pip install --use-feature=2020-resolver tensorflow_probability tensorflow-addons tensorflow_hub transformers pandas "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html flax nltk spacy seaborn rich torch
|
|
|
1 |
FROM tensorflow/tensorflow:2.6.0rc2-gpu-jupyter
|
2 |
|
3 |
+
RUN pip install --use-feature=2020-resolver tensorflow_probability tensorflow-addons tensorflow_hub transformers datasets pandas "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html flax nltk spacy seaborn rich torch
|
Makefile
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
VERSION := 0.0.
|
2 |
NAME := gpt-code
|
3 |
REPO := prophetikai
|
4 |
|
|
|
1 |
+
VERSION := 0.0.2
|
2 |
NAME := gpt-code
|
3 |
REPO := prophetikai
|
4 |
|
clone_sentdex_model_tokenizer.py
CHANGED
File without changes
|
config.json
CHANGED
@@ -32,7 +32,7 @@
|
|
32 |
}
|
33 |
},
|
34 |
"torch_dtype": "float32",
|
35 |
-
"transformers_version": "4.9.
|
36 |
"use_cache": true,
|
37 |
"vocab_size": 52000
|
38 |
}
|
|
|
32 |
}
|
33 |
},
|
34 |
"torch_dtype": "float32",
|
35 |
+
"transformers_version": "4.9.2",
|
36 |
"use_cache": true,
|
37 |
"vocab_size": 52000
|
38 |
}
|
prophetikai/gpt-code/config.json
DELETED
@@ -1,37 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"_name_or_path": "Sentdex/GPyT",
|
3 |
-
"activation_function": "gelu_new",
|
4 |
-
"architectures": [
|
5 |
-
"GPT2LMHeadModel"
|
6 |
-
],
|
7 |
-
"attn_pdrop": 0.1,
|
8 |
-
"bos_token_id": 0,
|
9 |
-
"embd_pdrop": 0.1,
|
10 |
-
"eos_token_id": 2,
|
11 |
-
"gradient_checkpointing": false,
|
12 |
-
"initializer_range": 0.02,
|
13 |
-
"layer_norm_epsilon": 1e-05,
|
14 |
-
"model_type": "gpt2",
|
15 |
-
"n_ctx": 1024,
|
16 |
-
"n_embd": 768,
|
17 |
-
"n_head": 12,
|
18 |
-
"n_inner": null,
|
19 |
-
"n_layer": 12,
|
20 |
-
"n_positions": 1024,
|
21 |
-
"resid_pdrop": 0.1,
|
22 |
-
"scale_attn_weights": true,
|
23 |
-
"summary_activation": null,
|
24 |
-
"summary_first_dropout": 0.1,
|
25 |
-
"summary_proj_to_labels": true,
|
26 |
-
"summary_type": "cls_index",
|
27 |
-
"summary_use_proj": true,
|
28 |
-
"task_specific_params": {
|
29 |
-
"text-generation": {
|
30 |
-
"do_sample": false,
|
31 |
-
"max_length": 50
|
32 |
-
}
|
33 |
-
},
|
34 |
-
"transformers_version": "4.9.1",
|
35 |
-
"use_cache": true,
|
36 |
-
"vocab_size": 52000
|
37 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
saved_model/sentdex/keras_metadata.pb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 67119
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2c122d854316f313f362e8236dc36642fde96756a8a492c806f70264afa96efa
|
3 |
size 67119
|
saved_model/sentdex/saved_model.pb
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55d97e20adc190a8ba38e59d0be3db2ffd999b46cccf36d76438b9b172cd7ef1
|
3 |
+
size 5075538
|
saved_model/sentdex/variables/variables.data-00000-of-00001
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:795c48f76144639181568cde0f584330559792342d552a69fa17d293687f7c58
|
3 |
+
size 503189630
|
saved_model/sentdex/variables/variables.index
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63b43223ff4ea0dc60d5f447600078a2ce54088c634b7e117504cc4ca765b9d0
|
3 |
+
size 8801
|
tf_model.h5
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 503289416
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:8765e985e5ae7e84747e2368fc4de8bcc2db910401ab90caff21d0e4fcbbea49
|
3 |
size 503289416
|
train.py
CHANGED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
#!/usr/bin/env python
"""Training stub: enumerate CodeSearchNet-style jsonl shards and load them
as HuggingFace dataset splits.

Expects shards under ``data/{train,valid,test}/python_{split}_{i}.jsonl``.
"""

_DATA_DIR = 'data'

# CodeSearchNet's python corpus ships 14 training shards but a single shard
# each for validation and test. The original `range(0)` produced empty file
# lists for those splits, which `load_dataset` rejects.
splits = {
    'train': [f'{_DATA_DIR}/train/python_train_{i}.jsonl' for i in range(14)],
    'validation': [f'{_DATA_DIR}/valid/python_valid_{i}.jsonl' for i in range(1)],
    'test': [f'{_DATA_DIR}/test/python_test_{i}.jsonl' for i in range(1)],
}

if __name__ == '__main__':
    # Import and load lazily so importing this module has no side effects
    # (the original loaded the full dataset at import time).
    from datasets import load_dataset

    dataset = load_dataset('json', data_files=splits)
    print(dataset)