Create utils.py
utils.py
ADDED
import os

import torch
import torch.distributed as dist
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler
from torchvision import datasets, transforms


# Set your model class here (for demonstration, we use a simple CNN for MNIST)
class SimpleCNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1)
        self.fc1 = nn.Linear(64 * 7 * 7, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        # Two conv -> ReLU -> 2x2 max-pool stages: 28x28 -> 14x14 -> 7x7
        x = F.max_pool2d(F.relu(self.conv1(x)), kernel_size=2, stride=2)
        x = F.max_pool2d(F.relu(self.conv2(x)), kernel_size=2, stride=2)
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        return self.fc2(x)


def init_distributed_mode():
    # RANK and WORLD_SIZE are set by the launcher (e.g. torchrun); without them
    # we fall back to ordinary single-process training.
    if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
        rank = int(os.environ['RANK'])
        world_size = int(os.environ['WORLD_SIZE'])
        dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
        torch.cuda.set_device(rank % torch.cuda.device_count())
        print(f"Initialized distributed mode: rank {rank}, world size {world_size}")
    else:
        print("Not using distributed mode")
        rank = 0
        world_size = 1
    return rank, world_size


def main():
    # Initialize the distributed mode
    rank, world_size = init_distributed_mode()

    # Set up data transformations
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,))
    ])

    # Load the dataset. DistributedSampler requires an initialized process group,
    # so only use it when actually running distributed; otherwise shuffle normally.
    train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    train_sampler = DistributedSampler(train_dataset) if world_size > 1 else None
    train_loader = DataLoader(train_dataset, batch_size=64, sampler=train_sampler,
                              shuffle=(train_sampler is None))

    # Initialize the model and move it to this process's device
    model = SimpleCNN()
    if torch.cuda.is_available():
        device = torch.device(f'cuda:{rank % torch.cuda.device_count()}')
    else:
        device = torch.device('cpu')
    model.to(device)

    # Wrap the model with DDP; device_ids expects the local device index, not the global rank
    if world_size > 1:
        model = DDP(model, device_ids=[device.index], output_device=device.index)

    # Set up the optimizer and loss function
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()

    # Training loop
    for epoch in range(10):  # Train for 10 epochs
        if train_sampler is not None:
            train_sampler.set_epoch(epoch)  # Reshuffle the per-rank shards every epoch
        running_loss = 0.0

        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        if rank == 0:  # Only print from the main process
            print(f'Epoch [{epoch + 1}/10], Loss: {running_loss / len(train_loader):.4f}')

    # Clean up distributed training
    if world_size > 1:
        dist.destroy_process_group()


if __name__ == '__main__':
    main()
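
A minimal way to exercise this script, assuming a single node with multiple GPUs, is to launch it with torchrun, which sets the RANK and WORLD_SIZE environment variables that init_distributed_mode() reads; running it with plain python falls back to single-process training. The process count below is only an example.

# Example launch: 4 processes, one per GPU, on a single machine
torchrun --nproc_per_node=4 utils.py

# Single-process fallback (no RANK/WORLD_SIZE set)
python utils.py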