dsmueller committed on
Commit
f481cbe
1 Parent(s): 5849307

Initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +3 -0
  2. Dockerfile +34 -0
  3. README.md +5 -5
  4. train_llm.py +69 -0
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .venv/
2
+ __pycache__/
3
+ .env
Dockerfile ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Container image for running the autotrain fine-tuning script (train_llm.py).
# Built for a Hugging Face Space (README sets sdk: docker, app_port: 8501).

# Use an official Python runtime as a parent image
FROM python:3.11.1

# Set the working directory in the container
WORKDIR /app

# Install poetry
# RUN pip3 install poetry==1.7.1

# Copy the current directory contents into the container at /usr/src/app
COPY . .

# Install dependencies
# RUN poetry config virtualenvs.create false \
#     && poetry install --no-interaction --no-ansi
# Streamlit must be installed separately. Potentially this will cause an issue with dependencies in the future, but it's the only way it works.
# RUN pip3 install streamlit

# Install dependencies
RUN pip3 install -r requirements.txt

# Make a port available to the world outside this container
# The EXPOSE instruction informs Docker that the container listens on the specified network ports at runtime. Your container needs to listen to Streamlit's (default) port 8501.
EXPOSE 8501

# The HEALTHCHECK instruction tells Docker how to test a container to check that it is still working.
# NOTE(review): this probes Streamlit's health endpoint on port 8501, but CMD below runs
# train_llm.py, which does not appear to start a Streamlit server — confirm whether the
# container will ever report healthy, or whether the healthcheck should be removed/changed.
HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health

# Run the command inside your image filesystem.
CMD ["python", "train_llm.py"]

# Execute with:
# docker build -t <image_name> .
# docker run -p 8501:8501 <image_name>
README.md CHANGED
@@ -1,11 +1,11 @@
1
  ---
2
  title: Autotrain Playground
3
- emoji: 👀
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: docker
7
  pinned: false
8
- license: apache-2.0
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
  title: Autotrain Playground
3
+ emoji: 🚀
4
+ colorFrom: gray
5
+ colorTo: blue
6
  sdk: docker
7
  pinned: false
8
+ app_port: 8501
9
  ---
10
 
11
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
train_llm.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Fine-tune an LLM with autotrain on a Hugging Face dataset.

Flow: download the dataset, export its train/validation splits as CSV files,
then shell out to the ``autotrain llm`` CLI (inside the project virtualenv)
to train and push the resulting model to the Hub.
"""
import os
import subprocess
from uuid import uuid4

from datasets import load_dataset

# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)

# Load dataset (requires network access to the Hugging Face Hub).
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)

# Write dataset files into data directory
data_directory = '../fine_tune_data/'

# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)

# Write the train data to a CSV file, keeping only the 'text' column.
train_data = 'train_data.csv'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename, columns=['text'], index=False)

# Write the validation data to a CSV file.
# NOTE(review): this raises KeyError if the dataset has no 'validation' split —
# confirm the dataset always provides one.
validation_data = 'validation_data.csv'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename, columns=['text'], index=False)

# Define project parameters.
username = 'ai-aerospace'
# BUG FIX: the original called uuid4() twice, so the local project directory and
# the Hub repo name carried *different* ids. Generate one id and reuse it so the
# project directory and pushed repo are correlated.
run_id = str(uuid4())
project_name = './llms/' + 'ams_data_train-100_' + run_id
repo_name = 'ams_data_train-100_' + run_id

model_name = 'TinyLlama/TinyLlama-1.1B-Chat-v0.1'
# model_name='mistralai/Mistral-7B-v0.1'

# Save parameters to environment variables so the shell command below can
# interpolate them with ${...}.
os.environ["project_name"] = project_name
os.environ["model_name"] = model_name
os.environ["repo_id"] = username + '/' + repo_name
os.environ["train_data"] = train_data
os.environ["validation_data"] = validation_data

# Set .venv and execute the autotrain script
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
# The training dataset to be used must be called training.csv and be located in the data_path folder.
command = """
source ../.venv/bin/activate && autotrain llm --train \
--project_name ${project_name} \
--model ${model_name} \
--data_path ../fine_tune_data \
--train_split ${train_data} \
--valid_split ${validation_data} \
--use-peft \
--learning_rate 2e-4 \
--train_batch_size 6 \
--num_train_epochs 3 \
--trainer sft \
--push_to_hub \
--repo_id ${repo_id} \
--token $HUGGINGFACE_TOKEN
"""

# Use subprocess.run() to execute the command.
# SECURITY NOTE: shell=True is required here for `source` and ${...} expansion;
# every interpolated value is set above from in-file constants, not untrusted
# input. check=True makes a failed training run raise CalledProcessError.
subprocess.run(command, shell=True, check=True, env=os.environ)