X3N0N-x6 committed on
Commit
c5b734d
1 Parent(s): be17afa

Upload 2 files

Files changed (2):
  1. client2.py +175 -0
  2. server2.py +13 -0
client2.py ADDED
import argparse
from collections import OrderedDict

import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from datasets import load_dataset
from rouge import Rouge
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import flwr as fl

from huggingface_hub import login

# login() prompts for a token in a terminal; notebook_login() only works
# inside a notebook. The dataset used here is public, so this step matters
# mainly if your environment requires authenticated Hub access.
login()


def load_data(node_id):
    """Load the BillSum legal-summarization dataset (training and eval)."""
    dataset = load_dataset("lighteval/legal_summarization", "BillSum")
    full_train_dataset = dataset["train"]
    full_eval_dataset = dataset["test"]

    tokenizer = AutoTokenizer.from_pretrained("t5-small")

    # Partition the training set between clients. The slices are deliberately
    # small and uneven to keep local training fast: partition 1 is the first
    # 1% of examples, partition 2 is the second half of the set.
    train_dataset_size = len(full_train_dataset)
    train_dataset_1 = full_train_dataset.select(range(0, train_dataset_size // 100))
    train_dataset_2 = full_train_dataset.select(range(train_dataset_size // 2, train_dataset_size))

    eval_dataset = full_eval_dataset.select(range(0, 100))

    # Pick this client's partition by node id
    train_dataset = train_dataset_1 if node_id == 0 else train_dataset_2

    # Tokenization happens on the fly in collate_fn, so no map() preprocessing
    # is needed here
    trainloader = DataLoader(train_dataset, batch_size=4, collate_fn=lambda data: collate_fn(data, tokenizer))
    evalloader = DataLoader(eval_dataset, batch_size=4, collate_fn=lambda data: collate_fn(data, tokenizer))

    return trainloader, evalloader, eval_dataset


def collate_fn(data, tokenizer):
    """Collate raw examples into padded input and label tensors."""
    articles = [item["article"] for item in data]
    summaries = [item["summary"] for item in data]

    # Tokenize the articles as encoder inputs...
    model_inputs = tokenizer(articles, truncation=True, padding=True, return_tensors="pt")
    # ...and the summaries as decoder labels (text_target routes them through
    # the tokenizer's target-side handling)
    labels = tokenizer(text_target=summaries, truncation=True, padding=True, return_tensors="pt")

    return {
        "input_ids": model_inputs["input_ids"],
        "attention_mask": model_inputs["attention_mask"],
        "labels": labels["input_ids"],
    }


def train(net, trainloader, epochs):
    """Run local training and return the updated state dict."""
    optimizer = AdamW(net.parameters(), lr=5e-5)
    device = next(net.parameters()).device
    net.train()
    total_batches = len(trainloader)
    print("Training started...")
    for epoch in range(epochs):
        for i, batch in enumerate(trainloader, start=1):
            # Move the whole batch to the model's device; "labels" stays in
            # the batch, so the model computes the seq2seq loss itself
            inputs = {k: v.to(device) for k, v in batch.items()}
            outputs = net(**inputs)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            # Print progress within the current epoch
            print(f"\rEpoch {epoch + 1}/{epochs} - Batch {i}/{total_batches} - Loss: {loss.item():.4f}", end="", flush=True)
    print("\nTraining finished.")

    return net.state_dict()


def calculate_rouge(net, eval_dataset, tokenizer):
    """Generate summaries for the eval set and return average ROUGE F1 scores."""
    rouge = Rouge()
    device = next(net.parameters()).device
    references = [example["summary"] for example in eval_dataset]

    generated_summaries = []
    for example in eval_dataset:
        input_ids = tokenizer(example["article"], truncation=True, return_tensors="pt")["input_ids"]
        outputs = net.generate(input_ids.to(device))
        generated_summaries.append(tokenizer.decode(outputs[0], skip_special_tokens=True))

    # avg=True returns scores averaged over all (hypothesis, reference) pairs
    scores = rouge.get_scores(generated_summaries, references, avg=True)
    return scores["rouge-1"]["f"], scores["rouge-2"]["f"], scores["rouge-l"]["f"]


def main(node_id):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = AutoModelForSeq2SeqLM.from_pretrained("t5-small").to(device)

    trainloader, _, eval_dataset = load_data(node_id)

    # Flower client
    class PlaceholderClient(fl.client.NumPyClient):
        def get_parameters(self, config):
            return [val.cpu().numpy() for _, val in net.state_dict().items()]

        def set_parameters(self, parameters):
            params_dict = zip(net.state_dict().keys(), parameters)
            state_dict = OrderedDict({k: torch.tensor(v) for k, v in params_dict})
            net.load_state_dict(state_dict, strict=True)

        def fit(self, parameters, config):
            self.set_parameters(parameters)
            train(net, trainloader, epochs=1)
            return self.get_parameters(config={}), len(trainloader), {}

        def evaluate(self, parameters, config):
            self.set_parameters(parameters)
            tokenizer = AutoTokenizer.from_pretrained("t5-small")
            rouge_1, rouge_2, rouge_l = calculate_rouge(net, eval_dataset, tokenizer)
            print(f"ROUGE-1 Score: {rouge_1:.4f}")
            print(f"ROUGE-2 Score: {rouge_2:.4f}")
            print(f"ROUGE-L Score: {rouge_l:.4f}")
            # No separate eval loss is computed; 0.0 is a placeholder
            return 0.0, len(eval_dataset), {
                "rouge-1": float(rouge_1),
                "rouge-2": float(rouge_2),
                "rouge-l": float(rouge_l),
            }

    # Start client
    fl.client.start_client(
        server_address="127.0.0.1:8089", client=PlaceholderClient().to_client()
    )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Flower")
    parser.add_argument(
        "--node-id",
        choices=[0, 1],
        required=True,
        type=int,
        help="Which of the two training-set partitions this client trains on.",
    )
    node_id = parser.parse_args().node_id
    main(node_id)
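
Aside: the get_parameters/set_parameters pair in client2.py is the standard Flower recipe for shuttling a PyTorch state_dict through NumPy arrays. A minimal standalone sketch of that round-trip, using the same t5-small model (no server required):

import torch
from collections import OrderedDict
from transformers import AutoModelForSeq2SeqLM

net = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

# Serialize: state_dict() iterates in a fixed order, so a plain list of
# ndarrays is enough to represent the whole model
params = [val.cpu().numpy() for _, val in net.state_dict().items()]

# Deserialize: zip the same keys back with the (possibly aggregated) arrays
state_dict = OrderedDict(
    (k, torch.tensor(v)) for k, v in zip(net.state_dict().keys(), params)
)
net.load_state_dict(state_dict, strict=True)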
server2.py ADDED
import flwr as fl


strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,
    fraction_evaluate=1.0,
)

# Start server
fl.server.start_server(
    server_address="0.0.0.0:8089",
    config=fl.server.ServerConfig(num_rounds=1),
    strategy=strategy,
)
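
As committed, FedAvg receives the per-client ROUGE metrics returned by evaluate() but does not combine them, because no metrics-aggregation function is configured (Flower logs a warning in that case). A sketch of a weighted average over clients, using FedAvg's evaluate_metrics_aggregation_fn hook:

from typing import List, Tuple

import flwr as fl
from flwr.common import Metrics


def weighted_rouge(results: List[Tuple[int, Metrics]]) -> Metrics:
    """Average each ROUGE score over clients, weighted by eval-set size."""
    total = sum(num_examples for num_examples, _ in results)
    return {
        key: sum(n * m[key] for n, m in results) / total
        for key in ("rouge-1", "rouge-2", "rouge-l")
    }


strategy = fl.server.strategy.FedAvg(
    fraction_fit=1.0,
    fraction_evaluate=1.0,
    evaluate_metrics_aggregation_fn=weighted_rouge,
)

To run the example, start the server first (python server2.py), then launch one client per partition in separate terminals: python client2.py --node-id 0 and python client2.py --node-id 1.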