Commit 4ab846c by jiangchengchengNLP · verified · 1 Parent(s): d3fa7ee

Upload 5 files


Files for training: data_download.py, download2.py, deepspeed_pretrain.py, deepspeed_train_150k.py, deepspeed_train_665k.py

data_download.py ADDED
@@ -0,0 +1,198 @@
"""
download.py

Utility functions for downloading and extracting various datasets to (local) disk.
"""

import os
import shutil
from pathlib import Path
from typing import Dict, List, TypedDict
from zipfile import ZipFile

import requests
from PIL import Image
from rich.progress import BarColumn, DownloadColumn, MofNCompleteColumn, Progress, TextColumn, TransferSpeedColumn
from tqdm import tqdm

# from prismatic.overwatch import initialize_overwatch

# Initialize Overwatch =>> Wraps `logging.Logger`
# overwatch = initialize_overwatch(__name__)


# === Dataset Registry w/ Links ===
# fmt: off
DatasetComponent = TypedDict(
    "DatasetComponent",
    {"name": str, "extract": bool, "extract_type": str, "url": str, "do_rename": bool},
    total=False,
)

DATASET_REGISTRY: Dict[str, List[DatasetComponent]] = {
    # === LLaVa v1.5 Dataset(s) ===

    # Note =>> This is the full suite of datasets included in the LLaVa 1.5 "finetuning" stage; all the LLaVa v1.5
    #          models are finetuned on this split. We use this dataset for all experiments in our paper.
    "llava-v1.5-instruct": [
        {
            "name": "coco/train2017",   # Visual Instruct Tuning images are all sourced from COCO Train 2017
            "extract": True,
            "extract_type": "directory",
            "url": "http://images.cocodataset.org/zips/train2017.zip",
            "do_rename": True,
        },
        {
            "name": "gqa/images",
            "extract": True,
            "extract_type": "directory",
            "url": "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip",
            "do_rename": True,
        },
        {
            "name": "ocr_vqa/images",
            "extract": True,
            "extract_type": "directory",
            "url": "https://hf-mirror.com/datasets/qnguyen3/ocr_vqa/resolve/main/ocr_vqa.zip",
            "do_rename": True,
        },
        {
            "name": "textvqa/train_images",
            "extract": True,
            "extract_type": "directory",
            "url": "https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip",
            "do_rename": True,
        },
        {
            "name": "vg/VG_100K",
            "extract": True,
            "extract_type": "directory",
            "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip",
            "do_rename": True,
        },
        {
            "name": "vg/VG_100K_2",
            "extract": True,
            "extract_type": "directory",
            "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip",
            "do_rename": True,
        },
    ]
}
# fmt: on


def convert_to_jpg(image_dir: Path) -> None:
    """Handling for OCR-VQA images specifically; iterates through the directory and converts all GIFs/PNGs to JPG."""
    print(f"Converting all images in `{image_dir}` to JPG")

    for image_fn in tqdm(list(image_dir.iterdir())):
        if image_fn.suffix in {".jpg", ".jpeg"} or (jpg_fn := image_dir / f"{image_fn.stem}.jpg").exists():
            continue

        if image_fn.suffix == ".gif":
            gif = Image.open(image_fn)
            gif.seek(0)
            gif.convert("RGB").save(jpg_fn)
        elif image_fn.suffix == ".png":
            Image.open(image_fn).convert("RGB").save(jpg_fn)
        else:
            raise ValueError(f"Unexpected image format `{image_fn.suffix}`")


def download_with_progress(url: str, download_dir: Path, chunk_size_bytes: int = 1024) -> Path:
    """Utility function for downloading files from the internet, with a handy Rich-based progress bar."""
    dest_path = download_dir / Path(url).name
    print(f"Downloading {dest_path} from `{url}`")
    if dest_path.exists():
        return dest_path

    # Otherwise --> fire an HTTP Request, with `stream = True`
    response = requests.get(url, stream=True)

    # Download w/ Transfer-Aware Progress
    #   => Reference: https://github.com/Textualize/rich/blob/master/examples/downloader.py
    with Progress(
        TextColumn("[bold]{task.description} - {task.fields[fname]}"),
        BarColumn(bar_width=None),
        "[progress.percentage]{task.percentage:>3.1f}%",
        "•",
        DownloadColumn(),
        "•",
        TransferSpeedColumn(),
        transient=True,
    ) as dl_progress:
        # Default a missing Content-Length header to 0; `int("None")` would raise a ValueError
        dl_tid = dl_progress.add_task(
            "Downloading", fname=dest_path.name, total=int(response.headers.get("content-length", 0))
        )
        with open(dest_path, "wb") as f:
            for data in response.iter_content(chunk_size=chunk_size_bytes):
                dl_progress.advance(dl_tid, f.write(data))

    return dest_path


def extract_with_progress(archive_path: Path, download_dir: Path, extract_type: str, cleanup: bool = False) -> Path:
    """Utility function for extracting compressed archives, with a handy Rich-based progress bar."""
    assert archive_path.suffix == ".zip", "Only `.zip` compressed archives are supported for now!"
    print(f"Extracting {archive_path.name} to `{download_dir}`")

    # Extract w/ Progress
    with Progress(
        TextColumn("[bold]{task.description} - {task.fields[aname]}"),
        BarColumn(bar_width=None),
        "[progress.percentage]{task.percentage:>3.1f}%",
        "•",
        MofNCompleteColumn(),
        transient=True,
    ) as ext_progress:
        with ZipFile(archive_path) as zf:
            ext_tid = ext_progress.add_task("Extracting", aname=archive_path.name, total=len(members := zf.infolist()))
            extract_path = Path(zf.extract(members[0], download_dir))
            if extract_type == "file":
                assert len(members) == 1, f"Archive `{archive_path}` with extract type `{extract_type}` has > 1 member!"
            elif extract_type == "directory":
                for member in members[1:]:
                    zf.extract(member, download_dir)
                    ext_progress.advance(ext_tid)
            else:
                raise ValueError(f"Extract type `{extract_type}` for archive `{archive_path}` is not defined!")

    # Cleanup (if specified)
    if cleanup:
        archive_path.unlink()

    return extract_path


def download_extract(dataset_id: str, root_dir: Path) -> None:
    """Download all files for a given dataset (querying registry above), extracting archives if necessary."""
    os.makedirs(download_dir := root_dir / "download" / dataset_id, exist_ok=True)

    # Download Files => Single-Threaded, with Progress Bar
    dl_tasks = [d for d in DATASET_REGISTRY[dataset_id] if not (download_dir / d["name"]).exists()]
    for dl_task in dl_tasks:
        dl_path = download_with_progress(dl_task["url"], download_dir)

        # Extract Files (if specified) --> Note (assumes ".zip" ONLY!)
        if dl_task["extract"]:
            dl_path = extract_with_progress(dl_path, download_dir, dl_task["extract_type"])
            dl_path = dl_path.parent if dl_path.is_file() else dl_path

        # Rename Path --> dl_task["name"]; make sure nested parents (e.g., `coco/`) exist first
        if dl_task["do_rename"]:
            (download_dir / dl_task["name"]).parent.mkdir(parents=True, exist_ok=True)
            shutil.move(dl_path, download_dir / dl_task["name"])


if __name__ == "__main__":
    # Set the root directory (a default download location)
    root_dir = Path("./data")
    os.makedirs(root_dir, exist_ok=True)

    # Download every registered dataset
    for dataset_id in DATASET_REGISTRY.keys():
        print(f"Starting download of dataset: {dataset_id}")
        download_extract(dataset_id, root_dir)

    print("All datasets downloaded!")
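
Note that `convert_to_jpg` is defined above but never invoked from the `__main__` block, even though its docstring says the OCR-VQA images need GIF/PNG conversion. If appended to data_download.py (so `Path` and `convert_to_jpg` are in scope), a post-download pass might look like the sketch below; the directory path is an assumption derived from the registry's `ocr_vqa/images` entry and the default `./data` root.

# Hypothetical follow-up step (not part of the original script): normalize OCR-VQA images to JPG.
ocr_vqa_dir = Path("./data/download/llava-v1.5-instruct/ocr_vqa/images")
if ocr_vqa_dir.exists():
    convert_to_jpg(ocr_vqa_dir)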
deepspeed_pretrain.py ADDED
@@ -0,0 +1,152 @@
import os
import json
import shutil
import argparse

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import deepspeed

os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
from qwenva import tokenizer
from qwenva import processor
from qwenva import qwenva

images_file_path = "/root/autodl-tmp/images"

with open('/root/autodl-tmp/chat.json', 'r', encoding='utf-8') as f:
    chat_data = json.load(f)

image_token = tokenizer.encode('<image>')[0]
pad_token = tokenizer.pad_token_id


def process_data(sample, max_len=8012):
    conversations = sample['conversations']
    labels = []
    input_ids = []
    flag = 0
    messages = []
    image_index = -100  # default, so `image_index` is never unbound when no <image> token appears
    for index, item in enumerate(conversations):
        if item['from'] == 'human':
            old_input_ids = input_ids
            messages.append({'role': 'user', 'content': item['value']})
            input_ids = tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True
            )
            labels += [-100] * (len(input_ids) - len(old_input_ids))
            if index == flag:
                try:
                    image_index = input_ids.index(image_token)
                    labels[image_index] = image_token
                except ValueError:
                    print("image token not found")
                    flag = index + 1
                    continue
        elif item['from'] == 'gpt':
            old_input_ids = input_ids
            messages.append({'role': 'assistant', 'content': item['value']})
            input_ids = tokenizer.apply_chat_template(
                messages
            )
            flag = index + 1
            labels += input_ids[len(old_input_ids):]

    # Pad or truncate so every sample has the same length
    if len(input_ids) > max_len:
        input_ids = input_ids[:max_len]
        labels = labels[:max_len]
        attention_mask = [1] * len(input_ids)
    else:
        attention_mask = [1] * len(input_ids) + [0] * (max_len - len(input_ids))
        input_ids += [pad_token] * (max_len - len(input_ids))
        labels += [-100] * (max_len - len(labels))

    # Convert to tensors
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)
    labels = torch.tensor(labels)
    image_index = torch.tensor(image_index)
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'image_idx': image_index
    }


class MyDataset(Dataset):
    def __init__(self, images_file_path, data, max_len=1024):
        self.images_file_path = images_file_path
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        output_ = process_data(self.data[index], max_len=self.max_len)
        img_path = os.path.join(self.images_file_path, self.data[index]['image'])
        img = Image.open(img_path)
        input_pixel = processor(images=img, return_tensors="pt")
        output_['input_pixel'] = input_pixel['pixel_values'].squeeze()
        return output_


dataset = MyDataset(images_file_path, chat_data, max_len=360)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
qwenva = qwenva.to(device)
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=qwenva,
    args=argparse.Namespace(),
    model_parameters=qwenva.parameters(),
    config_params="./deepspeed_config.json"
)
# checkpoint_path = "/root/autodl-tmp/best_model_2"
# model_engine.load_checkpoint(checkpoint_path)

# optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
# eps = 1e-8
accumulation_steps = 2


# Training loop
def train(model_engine, train_dataloader, optimizer, loss_fn, device, epochs):
    model_engine.train()
    # model_engine.to(device)
    for epoch in range(epochs):
        # Use tqdm to display a progress bar
        with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            optimizer.zero_grad()
            for batch_idx, batch in enumerate(train_dataloader):
                # Move the batch to the GPU
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                input_pixel = batch['input_pixel'].to(device)
                labels = batch['labels'].to(device)
                image_idx = batch['image_idx'].to(device)
                logits = model_engine(input_ids, attention_mask, input_pixel, image_idx)
                # Compute the loss (shift logits/labels by one for next-token prediction)
                # max_logits = logits.max(dim=-1, keepdim=True)[0]  # per-position max
                # stable_logits = logits - max_logits               # numerically stable logits
                loss = loss_fn(logits[:, :-1, :].reshape(-1, logits.shape[-1]), labels[:, 1:].reshape(-1).clone())
                # Backward pass
                model_engine.backward(loss)
                if (batch_idx + 1) % accumulation_steps == 0:
                    model_engine.step()
                pbar.update(1)
                pbar.set_postfix(loss=loss.item())  # show the current loss
                if (batch_idx + 1) % 24807 == 0:
                    # If the checkpoint folder exists, delete it and recreate it
                    if os.path.exists("/root/autodl-tmp/best_model_instruct"):
                        shutil.rmtree("/root/autodl-tmp/best_model_instruct")
                    os.makedirs("/root/autodl-tmp/best_model_instruct")
                    model_engine.save_checkpoint("/root/autodl-tmp/best_model_instruct")
                    print(f"model saved at batch {batch_idx + 1}")


train(model_engine, train_loader, optimizer, loss_fn, device, epochs=1)
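
All three training scripts hand `deepspeed.initialize` the path `./deepspeed_config.json`, which is not part of this upload. The sketch below shows what a minimal config could contain; every value is an illustrative assumption rather than the author's actual setting, except that `train_micro_batch_size_per_gpu` should match the DataLoader `batch_size` and `gradient_accumulation_steps` should match the script's `accumulation_steps`.

import json

# Hypothetical minimal DeepSpeed config (illustrative values, not the author's).
deepspeed_config = {
    "train_micro_batch_size_per_gpu": 8,  # must agree with DataLoader(batch_size=8)
    "gradient_accumulation_steps": 2,     # must agree with accumulation_steps = 2
    "optimizer": {"type": "Adam", "params": {"lr": 1e-4}},
    "fp16": {"enabled": True},
    "zero_optimization": {"stage": 2},
}
with open("./deepspeed_config.json", "w") as f:
    json.dump(deepspeed_config, f, indent=2)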
deepspeed_train_150k.py ADDED
@@ -0,0 +1,195 @@
import os
import json
import shutil
import argparse

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import deepspeed

# deepspeed.initialize(config="./deepspeed_config.json", log_level='DEBUG')
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
from qwenva import tokenizer
from qwenva import processor
from qwenva import qwenva

images_file_path = './data/download/llava-v1.5-instruct/coco/train2017'

with open('/root/autodl-tmp/LLaVA-Instruct-150K/llava_instruct_150k.json', 'r', encoding='utf-8') as f:
    chat_data = json.load(f)

image_token = tokenizer.encode('<image>')[0]
pad_token = tokenizer.pad_token_id


def process_data(sample, max_len=8012):
    conversations = sample['conversations']
    labels = []
    input_ids = []
    flag = 0
    messages = []
    image_index = -100  # default, so `image_index` is never unbound when no <image> token appears
    try:
        for index, item in enumerate(conversations):
            if item['from'] == 'human':
                old_input_ids = input_ids
                messages.append({'role': 'user', 'content': item['value']})
                input_ids = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True
                )
                labels += [-100] * (len(input_ids) - len(old_input_ids))
                if index == flag:
                    try:
                        image_index = input_ids.index(image_token)
                        labels[image_index] = image_token
                    except ValueError:
                        print("image token not found")
                        flag = index + 1
                        continue
            elif item['from'] == 'gpt':
                old_input_ids = input_ids
                messages.append({'role': 'assistant', 'content': item['value']})
                input_ids = tokenizer.apply_chat_template(
                    messages
                )
                labels += input_ids[len(old_input_ids):]
    except Exception as e:
        print(f"error in process_data_1: {e}")
        exit()

    # Pad or truncate so every sample has the same length
    try:
        if len(input_ids) > max_len:
            input_ids = input_ids[:max_len]
            labels = labels[:max_len]
            attention_mask = [1] * len(input_ids)
        else:
            attention_mask = [1] * len(input_ids) + [0] * (max_len - len(input_ids))
            input_ids += [pad_token] * (max_len - len(input_ids))
            labels += [-100] * (max_len - len(labels))
    except Exception as e:
        print(f"error in process_data_2: {e}")
        exit()

    # Convert to tensors
    try:
        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)
        labels = torch.tensor(labels)
        image_index = torch.tensor(image_index)
    except Exception as e:
        print(f"error in tensor: {e}")
        exit()
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'image_idx': image_index
    }


class MyDataset(Dataset):
    def __init__(self, images_file_path, data, max_len=1024):
        self.images_file_path = images_file_path
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        output_ = process_data(self.data[index], max_len=self.max_len)
        img_path = os.path.join(self.images_file_path, self.data[index]['image'])
        try:
            img = Image.open(img_path)
        except Exception:
            print(f"image {img_path} not found")
            output_['labels'] = torch.tensor([-100] * self.max_len)
            img = Image.new("RGB", (224, 224))  # fall back to a blank image so `img` stays defined and the batch keeps a valid shape
        input_pixel = processor(images=img, return_tensors="pt")
        output_['input_pixel'] = input_pixel['pixel_values'].squeeze()
        return output_


dataset = MyDataset(images_file_path, chat_data, max_len=2048)
train_loader = DataLoader(dataset, batch_size=8, shuffle=True)

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
qwenva = qwenva.to(device)
model_engine, optimizer, _, _ = deepspeed.initialize(
    model=qwenva,
    args=argparse.Namespace(),
    model_parameters=qwenva.parameters(),
    config_params="./deepspeed_config.json"
)
# checkpoint_path = "./best_model_2"
# model_engine.load_checkpoint(checkpoint_path)
# Save the compiled model weights:
# torch.save(model_engine.module.state_dict(), "./compiled_model.pth")

# Re-enable gradients on the text embedding, lm_head, and transformer submodules
for name, param in model_engine.module._orig_mod.text_embedding.named_parameters():
    param.requires_grad = True
    # print("embedding gradients enabled:", name)
# for name, param in model_engine.module._orig_mod.align_layer.named_parameters():
#     param.requires_grad = True
#     print("align_layer gradients enabled:", name)

for name, param in model_engine.module._orig_mod.lm_head.named_parameters():
    param.requires_grad = True
    # print("lm_head gradients enabled:", name)

for name, param in model_engine.module._orig_mod.transformer.named_parameters():
    param.requires_grad = True
    # print("transformer gradients enabled:", name)
for name, param in model_engine.module._orig_mod.named_parameters():
    if param.requires_grad:
        print(f"Layer: {name}, Requires Grad: {param.requires_grad}")


# optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
# eps = 1e-8
accumulation_steps = 1


# Training loop
def train(model_engine, train_dataloader, loss_fn, device, epochs):
    model_engine.train()
    # model_engine.to(device)
    for epoch in range(epochs):
        # Use tqdm to display a progress bar
        with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            # optimizer.zero_grad()
            try:
                for batch_idx, batch in enumerate(train_dataloader):
                    # Move the batch to the GPU
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    input_pixel = batch['input_pixel'].to(device)
                    labels = batch['labels'].to(device)
                    image_idx = batch['image_idx'].to(device)
                    logits = model_engine(input_ids, attention_mask, input_pixel, image_idx)
                    # Compute the loss (subtract the per-position max for numerical stability)
                    max_logits = logits.max(dim=-1, keepdim=True)[0]
                    stable_logits = logits - max_logits
                    loss = loss_fn(stable_logits[:, :-1, :].reshape(-1, stable_logits.shape[-1]), labels[:, 1:].reshape(-1).clone())
                    model_engine.backward(loss)
                    if (batch_idx + 1) % accumulation_steps == 0:
                        model_engine.step()
                    pbar.update(1)
                    pbar.set_postfix(loss=loss.item())  # show the current loss
                    if (batch_idx + 1) % 6000 == 0:
                        # If the checkpoint folder exists, delete it and recreate it
                        if os.path.exists("./best_model_2"):
                            shutil.rmtree("./best_model_2")
                        os.makedirs("./best_model_2")
                        model_engine.save_checkpoint("./best_model_2")
                        torch.save(model_engine.module.state_dict(), "./compiled_model_2.pth")
                        print(f"model saved at batch {batch_idx + 1}")
            except Exception as e:
                print(f"error in train: {e}")


train(model_engine, train_loader, loss_fn, device, epochs=2)
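
The label construction in `process_data` supervises only the assistant turns: every user-prompt position is masked with -100, which `nn.CrossEntropyLoss` ignores by default (its `ignore_index` is -100). A tokenizer-free toy sketch of that convention, with invented token ids:

# Toy illustration (invented ids): 5 prompt tokens followed by a 3-token reply.
prompt_len = 5
input_ids = [101, 7, 42, 9, 102, 55, 56, 57]
labels = [-100] * prompt_len + input_ids[prompt_len:]  # mask the prompt, supervise the reply
assert labels == [-100, -100, -100, -100, -100, 55, 56, 57]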
deepspeed_train_665k.py ADDED
@@ -0,0 +1,189 @@
import os
import json
import shutil
import argparse

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
import deepspeed

# deepspeed.initialize(config="./deepspeed_config.json", log_level='DEBUG')
os.environ['HF_ENDPOINT'] = "https://hf-mirror.com"
from qwenva import tokenizer
from qwenva import processor
from qwenva import qwenva

images_file_path = './data/download/llava-v1.5-instruct'

with open('/root/autodl-tmp/LLaVA-Instruct-150K/qwenva_mix665k.json', 'r', encoding='utf-8') as f:
    chat_data = json.load(f)

image_token = tokenizer.encode('<image>')[0]
pad_token = tokenizer.pad_token_id


def process_data(sample, max_len=8012):
    conversations = sample['conversations']
    labels = []
    input_ids = []
    flag = 0
    messages = []
    image_index = -100  # default: text-only samples in the mix665k split carry no <image> token
    try:
        for index, item in enumerate(conversations):
            if item['from'] == 'human':
                old_input_ids = input_ids
                messages.append({'role': 'user', 'content': item['value']})
                input_ids = tokenizer.apply_chat_template(
                    messages,
                    add_generation_prompt=True
                )
                labels += [-100] * (len(input_ids) - len(old_input_ids))
                if index == flag:
                    if image_token in input_ids:
                        image_index = input_ids.index(image_token)
                        labels[image_index] = image_token
                    else:
                        image_index = -100
            elif item['from'] == 'gpt':
                old_input_ids = input_ids
                messages.append({'role': 'assistant', 'content': item['value']})
                input_ids = tokenizer.apply_chat_template(
                    messages
                )
                labels += input_ids[len(old_input_ids):]
    except Exception as e:
        print(f"error in process_data_1: {e}")
        exit()

    # Pad or truncate so every sample has the same length
    try:
        if len(input_ids) > max_len:
            input_ids = input_ids[:max_len]
            labels = labels[:max_len]
            attention_mask = [1] * len(input_ids)
        else:
            attention_mask = [1] * len(input_ids) + [0] * (max_len - len(input_ids))
            input_ids += [pad_token] * (max_len - len(input_ids))
            labels += [-100] * (max_len - len(labels))
    except Exception as e:
        print(f"error in process_data_2: {e}")
        exit()

    # Convert to tensors
    try:
        input_ids = torch.tensor(input_ids)
        attention_mask = torch.tensor(attention_mask)
        labels = torch.tensor(labels)
        image_index = torch.tensor(image_index)
    except Exception as e:
        print(f"error in tensor: {e}")
        exit()
    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
        'labels': labels,
        'image_idx': image_index
    }


class MyDataset(Dataset):
    def __init__(self, images_file_path, data, max_len=1024):
        self.images_file_path = images_file_path
        self.data = data
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        output_ = process_data(self.data[index], max_len=self.max_len)
        if output_['image_idx'] != -100:
            img_path = os.path.join(self.images_file_path, self.data[index]['image'])
            img = Image.open(img_path)
            input_pixel = processor(images=img, return_tensors="pt")
            output_['input_pixel'] = input_pixel['pixel_values'].squeeze()
        else:
            # Text-only sample: feed an all-zero image; pixel values stay floating-point
            # (the original cast them to the integer dtype of `input_ids`, which the vision tower would reject)
            output_['input_pixel'] = torch.zeros(3, 224, 224)
        return output_


dataset = MyDataset(images_file_path, chat_data, max_len=2048)
train_loader = DataLoader(dataset, batch_size=16, shuffle=True)

model_engine, optimizer, _, _ = deepspeed.initialize(
    model=qwenva,
    args=argparse.Namespace(),
    model_parameters=qwenva.parameters(),
    config_params="./deepspeed_config.json"
)
# checkpoint_path = "./best_model_2"
# model_engine.load_checkpoint(checkpoint_path)
# Save the compiled model weights:
# torch.save(model_engine.module.state_dict(), "./compiled_model.pth")

# Re-enable gradients on the text embedding, lm_head, and transformer submodules
for name, param in model_engine.module._orig_mod.text_embedding.named_parameters():
    param.requires_grad = True
    # print("embedding gradients enabled:", name)
# for name, param in model_engine.module._orig_mod.align_layer.named_parameters():
#     param.requires_grad = True
#     print("align_layer gradients enabled:", name)

for name, param in model_engine.module._orig_mod.lm_head.named_parameters():
    param.requires_grad = True
    # print("lm_head gradients enabled:", name)

for name, param in model_engine.module._orig_mod.transformer.named_parameters():
    param.requires_grad = True
    # print("transformer gradients enabled:", name)
for name, param in model_engine.module._orig_mod.named_parameters():
    if param.requires_grad:
        print(f"Layer: {name}, Requires Grad: {param.requires_grad}")


# optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.CrossEntropyLoss()
# eps = 1e-8
accumulation_steps = 1


# Training loop
def train(model_engine, train_dataloader, loss_fn, device, epochs):
    model_engine.train()
    # model_engine.to(device)
    for epoch in range(epochs):
        # Use tqdm to display a progress bar
        with tqdm(total=len(train_dataloader), desc=f'Epoch {epoch + 1}/{epochs}', unit='batch') as pbar:
            # optimizer.zero_grad()
            try:
                for batch_idx, batch in enumerate(train_dataloader):
                    # Move the batch to the GPU
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    input_pixel = batch['input_pixel'].to(device)
                    labels = batch['labels'].to(device)
                    image_idx = batch['image_idx'].to(device)
                    logits = model_engine(input_ids, attention_mask, input_pixel, image_idx)
                    # Compute the loss (shift logits/labels by one for next-token prediction)
                    # max_logits = logits.max(dim=-1, keepdim=True)[0]  # per-position max
                    # stable_logits = logits - max_logits               # numerically stable logits
                    loss = loss_fn(logits[:, :-1, :].reshape(-1, logits.shape[-1]), labels[:, 1:].reshape(-1).clone())
                    model_engine.backward(loss)
                    if (batch_idx + 1) % accumulation_steps == 0:
                        model_engine.step()
                    pbar.update(1)
                    pbar.set_postfix(loss=loss.item())  # show the current loss
                    if (batch_idx + 1) % 4100 == 0:
                        # If the checkpoint folder exists, delete it and recreate it
                        if os.path.exists("./best_model_2"):
                            shutil.rmtree("./best_model_2")
                        os.makedirs("./best_model_2")
                        model_engine.save_checkpoint("./best_model_2")
                        torch.save(model_engine.module.state_dict(), "./compiled_model_3.pth")
                        print(f"model saved at batch {batch_idx + 1}")
            except Exception as e:
                print(f"error in train: {e}")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train(model_engine, train_loader, loss_fn, device, epochs=2)
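
In `train`, the loss call drops the last logit and the first label so that the logit at position t is scored against the token at position t+1. A self-contained sketch of that alignment with random tensors:

import torch
import torch.nn as nn

# Shift-by-one next-token loss, as used in train() above.
batch, seq, vocab = 2, 8, 32
logits = torch.randn(batch, seq, vocab)
labels = torch.randint(0, vocab, (batch, seq))
labels[:, :3] = -100                       # masked prompt positions, as produced by process_data
loss_fn = nn.CrossEntropyLoss()            # default ignore_index=-100 skips masked targets
loss = loss_fn(
    logits[:, :-1, :].reshape(-1, vocab),  # positions 0..seq-2 predict ...
    labels[:, 1:].reshape(-1),             # ... tokens 1..seq-1
)
print(loss.item())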
download2.py ADDED
@@ -0,0 +1,217 @@
import os
import shutil
from pathlib import Path
from typing import Dict, List, TypedDict
from zipfile import ZipFile

import requests
from PIL import Image
from rich.progress import BarColumn, DownloadColumn, MofNCompleteColumn, Progress, TextColumn, TransferSpeedColumn
from tqdm import tqdm

# Registry entries below are kept for reference only (commented out of DATASET_REGISTRY):
"""
{
    "name": "coco/train2017",   # Visual Instruct Tuning images are all sourced from COCO Train 2017
    "extract": True,
    "extract_type": "directory",
    "url": "http://images.cocodataset.org/zips/train2017.zip",
    "do_rename": True,
},
{
    "name": "gqa/images",
    "extract": True,
    "extract_type": "directory",
    "url": "https://downloads.cs.stanford.edu/nlp/data/gqa/images.zip",
    "do_rename": True,
},
{
    "name": "ocr_vqa/images",
    "extract": True,
    "extract_type": "directory",
    "url": "https://hf-mirror.com/datasets/qnguyen3/ocr_vqa/resolve/main/ocr_vqa.zip",
    "do_rename": True,
},
{
    "name": "textvqa/train_images",
    "extract": True,
    "extract_type": "directory",
    "url": "https://dl.fbaipublicfiles.com/textvqa/images/train_val_images.zip",
    "do_rename": True,
},
{
    "name": "vg/VG_100K_2",
    "extract": True,
    "extract_type": "directory",
    "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/images2.zip",
    "do_rename": True,
},
"""

# === Dataset Registry w/ Links ===
# fmt: off
DatasetComponent = TypedDict(
    "DatasetComponent",
    {"name": str, "extract": bool, "extract_type": str, "url": str, "do_rename": bool},
    total=False,
)

DATASET_REGISTRY: Dict[str, List[DatasetComponent]] = {
    # === LLaVa v1.5 Dataset(s) ===
    "llava-v1.5-instruct": [
        {
            "name": "vg/VG_100K",
            "extract": True,
            "extract_type": "directory",
            "url": "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip",
            "do_rename": True,
        }
    ]
}
# fmt: on


def convert_to_jpg(image_dir: Path) -> None:
    """Handling for OCR-VQA images specifically; iterates through the directory and converts all GIFs/PNGs to JPG."""
    print(f"Converting all images in `{image_dir}` to JPG")

    for image_fn in tqdm(list(image_dir.iterdir())):
        jpg_fn = image_dir / f"{image_fn.stem}.jpg"  # build the target JPG filename
        if image_fn.suffix in {".jpg", ".jpeg"} or jpg_fn.exists():
            continue

        if image_fn.suffix == ".gif":
            gif = Image.open(image_fn)
            gif.seek(0)
            gif.convert("RGB").save(jpg_fn)
        elif image_fn.suffix == ".png":
            Image.open(image_fn).convert("RGB").save(jpg_fn)
        else:
            raise ValueError(f"Unexpected image format `{image_fn.suffix}`")


# DatasetComponent and DATASET_REGISTRY are unchanged from data_download.py

def download_with_progress(url: str, download_dir: Path, chunk_size_bytes: int = 1024) -> Path:
    """Download a file with retries, resuming partial downloads via HTTP Range requests."""
    print(f"Downloading {url}")

    dest_path = download_dir / Path(url).name
    if dest_path.exists():
        return dest_path

    max_retries = 5
    for attempt in range(max_retries):
        try:
            # Resume from whatever a failed earlier attempt left on disk; the original
            # `resume_header` was never populated, so retries re-appended from byte 0
            downloaded = dest_path.stat().st_size if dest_path.exists() else 0
            resume_header = {"Range": f"bytes={downloaded}-"} if downloaded else {}
            response = requests.get(url, headers=resume_header, stream=True)
            if response.status_code not in (200, 206):
                raise Exception(f"Failed to download. Status code: {response.status_code}")

            # A 200 means the server ignored the Range header; start over rather than append
            mode = "ab" if response.status_code == 206 else "wb"
            if mode == "wb":
                downloaded = 0

            # Download progress bar
            with Progress(
                TextColumn("[bold]{task.description} - {task.fields[fname]}"),
                BarColumn(bar_width=None),
                "[progress.percentage]{task.percentage:>3.1f}%",
                "•",
                DownloadColumn(),
                "•",
                TransferSpeedColumn(),
                transient=True,
            ) as dl_progress:
                total = int(response.headers.get("content-length", 0)) + downloaded
                dl_tid = dl_progress.add_task(
                    "Downloading", fname=dest_path.name, total=total, completed=downloaded
                )
                with open(dest_path, mode) as f:
                    for data in response.iter_content(chunk_size=chunk_size_bytes):
                        # Advance by the bytes actually written, not the nominal chunk size
                        dl_progress.advance(dl_tid, f.write(data))

            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1}/{max_retries} failed: {e}")
            if attempt < max_retries - 1:
                print("Retrying...")
            else:
                raise

# The remaining functions and the main entry point are unchanged


def extract_with_progress(archive_path: Path, download_dir: Path, extract_type: str, cleanup: bool = False) -> Path:
    """Utility function for extracting compressed archives, with a handy Rich-based progress bar."""
    assert archive_path.suffix == ".zip", "Only `.zip` compressed archives are supported for now!"
    print(f"Extracting {archive_path.name} to `{download_dir}`")

    with Progress(
        TextColumn("[bold]{task.description} - {task.fields[aname]}"),
        BarColumn(bar_width=None),
        "[progress.percentage]{task.percentage:>3.1f}%",
        "•",
        MofNCompleteColumn(),
        transient=True,
    ) as ext_progress:
        with ZipFile(archive_path) as zf:
            ext_tid = ext_progress.add_task("Extracting", aname=archive_path.name, total=len(members := zf.infolist()))
            extract_path = Path(zf.extract(members[0], download_dir))
            if extract_type == "file":
                assert len(members) == 1, f"Archive `{archive_path}` with extract type `{extract_type}` has > 1 member!"
            elif extract_type == "directory":
                for member in members[1:]:
                    zf.extract(member, download_dir)
                    ext_progress.advance(ext_tid)
            else:
                raise ValueError(f"Extract type `{extract_type}` for archive `{archive_path}` is not defined!")

    if cleanup:
        archive_path.unlink()

    return extract_path


def download_extract(dataset_id: str, root_dir: Path) -> None:
    """Download all files for a given dataset (querying registry above), extracting archives if necessary."""
    os.makedirs(download_dir := root_dir / "download" / dataset_id, exist_ok=True)

    # Download Files
    dl_tasks = [d for d in DATASET_REGISTRY[dataset_id] if not (download_dir / d["name"]).exists()]
    for dl_task in dl_tasks:
        dl_path = download_with_progress(dl_task["url"], download_dir)

        if dl_task["extract"]:
            dl_path = extract_with_progress(dl_path, download_dir, dl_task["extract_type"])
            dl_path = dl_path.parent if dl_path.is_file() else dl_path

        if dl_task["do_rename"]:
            (download_dir / dl_task["name"]).parent.mkdir(parents=True, exist_ok=True)
            shutil.move(dl_path, download_dir / dl_task["name"])


if __name__ == "__main__":
    # Set the root directory (a default download location)
    root_dir = Path("./data")
    os.makedirs(root_dir, exist_ok=True)

    # Download every registered dataset
    for dataset_id in DATASET_REGISTRY.keys():
        print(f"Starting download of dataset: {dataset_id}")
        download_extract(dataset_id, root_dir)

    print("All datasets downloaded!")
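
One caveat: a file that already exists on disk is returned without further checks, so a partial file left by a crashed earlier run is treated as complete. A quick way to sanity-check a download against the server-reported size (a sketch; it assumes the server sets Content-Length on HEAD requests):

import requests
from pathlib import Path

# Sketch: compare on-disk size with the server's Content-Length before trusting a file.
url = "https://cs.stanford.edu/people/rak248/VG_100K_2/images.zip"
dest = Path("./data/download/llava-v1.5-instruct") / Path(url).name
expected = int(requests.head(url, allow_redirects=True).headers.get("content-length", 0))
if expected and dest.exists() and dest.stat().st_size != expected:
    print(f"{dest.name}: have {dest.stat().st_size} bytes, expected {expected}")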