Commit · 23975dc
Parent(s): ff2e42d

tunetest
- fine_tune_model.py +111 -0
- testgensummary.py +45 -0
fine_tune_model.py
ADDED
@@ -0,0 +1,111 @@
from transformers import Trainer, TrainingArguments, T5ForConditionalGeneration, T5Tokenizer
import json
from pathlib import Path

# Load the T5 model and tokenizer from a local directory
model_path = "t5-small-model"
tokenizer = T5Tokenizer.from_pretrained(model_path)
model = T5ForConditionalGeneration.from_pretrained(model_path)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./output1",         # Output directory for model checkpoints and predictions
    save_steps=100,
    per_device_train_batch_size=4,  # Adjust the batch size to fit your GPU memory
    save_total_limit=2,             # Keep at most two checkpoints on disk
    num_train_epochs=3,             # Number of training epochs
    logging_dir="./logs",           # Directory for TensorBoard logs
)

# Read one JSON file and turn its records into tokenized training examples
def format_dataset(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"File content:\n{content}\n")

    try:
        data_list = json.loads(content)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON in {file_path}: {e}")
        return None

    formatted_examples = []

    for data in data_list:
        input_texts = data.get("input")
        targets = data.get("target")

        # Convert to lists if not already
        if not isinstance(input_texts, list):
            input_texts = [input_texts]
        if not isinstance(targets, list):
            targets = [targets]

        # Concatenate the texts in each list
        input_text_concatenated = " ".join(input_texts)
        target_text_concatenated = " ".join(targets)

        # Encode the concatenated texts with padding and truncation
        inputs = tokenizer(
            input_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )
        labels = tokenizer(
            target_text_concatenated,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=512,
        )

        # Use the target token ids as labels, replacing padding ids with -100
        # so that padded positions are ignored by the loss
        label_ids = labels["input_ids"]
        label_ids[label_ids == tokenizer.pad_token_id] = -100
        inputs["labels"] = label_ids

        formatted_examples.append(inputs)

    return formatted_examples

# Process each JSON file once, skipping any file that failed to parse
data_files = Path("./files/").rglob("*.json")
per_file_examples = (format_dataset(file_path) for file_path in data_files)
formatted_examples = [examples for examples in per_file_examples if examples is not None]

# Flatten the per-file lists into a single list of examples
formatted_examples = [example for sublist in formatted_examples for example in sublist]

# Create the final dataset: one dict of unbatched tensors per example
train_dataset = [
    {
        "input_ids": example["input_ids"][0],
        "attention_mask": example["attention_mask"][0],
        "labels": example["labels"][0],
    }
    for example in formatted_examples
]

# Instantiate the Trainer; passing the tokenizer saves it alongside checkpoints
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    # ... other Trainer configurations ...
)

print(f"Number of examples in the training dataset: {len(train_dataset)}")

# Print the model configuration
print("Model Configuration:")
print(model.config)

# Run training
trainer.train()

# Save the model and tokenizer after training
model.save_pretrained("./output/fine-tuned-model")
tokenizer.save_pretrained("./output/fine-tuned-model")
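Note: format_dataset() assumes each JSON file under ./files/ holds a list of records with "input" and "target" keys, where each value is a string or a list of strings. Below is a minimal sketch that writes one such file; the file name and example texts are hypothetical placeholders.

import json
from pathlib import Path

# Hypothetical sample file in the schema format_dataset() parses:
# a JSON list of records, each with "input" and "target" fields.
Path("./files").mkdir(parents=True, exist_ok=True)
sample = [
    {
        "input": "summarize: <source text goes here>",  # placeholder input text
        "target": "<reference summary goes here>",      # placeholder target text
    }
]
with open("./files/sample.json", "w", encoding="utf-8") as f:
    json.dump(sample, f, indent=2)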
testgensummary.py
ADDED
@@ -0,0 +1,45 @@
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the fine-tuned model and tokenizer
model_name = "C:\\fine-tuned-model"
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)

# Prompt
prompt = """Write a detailed medical summary in paragraph format, including patient details such as sex and age, from the data below:
{
    "Sex": "M",
    "ID": 585248,
    "DateOfBirth": "08/10/1995",
    "Age": "28 years",
    "VisitDate": "09/25/2023",
    "LogNumber": 6418481,
    "Historian": "Self",
    "TriageNotes": ["fever"],
    "HistoryOfPresentIllness": {
        "Complaint": [
            "The patient presents with a chief complaint of chills.",
            "The problem is made better by exercise and rest.",
            "The patient also reports change in appetite and chest pain/pressure as abnormal symptoms related to the complaint."
        ]
    }
}"""

# Tokenize, then generate with beam-search sampling and the decoding parameters below
input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=512, truncation=True)
generated_text = model.generate(
    input_ids,
    max_length=200,
    num_beams=5,
    temperature=0.9,  # Higher temperature gives more randomness
    no_repeat_ngram_size=2,
    top_k=50,
    top_p=0.95,
    early_stopping=True,
    do_sample=True,
)

# Decode and print the generated text
decoded_text = tokenizer.decode(generated_text[0], skip_special_tokens=True)
print(f"Generated Text: {decoded_text}")
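As a usage note, the same generation can also be driven through the transformers pipeline API; a minimal sketch, assuming the same checkpoint path as in the script above:

from transformers import pipeline

# Text2text pipeline wrapping the fine-tuned checkpoint (same path as above)
summarizer = pipeline("text2text-generation", model="C:\\fine-tuned-model")

# A placeholder prompt; in practice, pass the full prompt from the script above.
prompt = "Write a detailed medical summary in paragraph format from the data below: ..."

# Generation keyword arguments are forwarded to model.generate()
result = summarizer(prompt, max_length=200, num_beams=5, do_sample=True)
print(result[0]["generated_text"])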