dnouri committed
Commit 9274888 · 1 Parent(s): 19aaf96

Add pyt model from Kipoi examples


Source revision:
https://github.com/kipoi/kipoi/tree/6b5460c1cd1ba9667c23b7cb029640116147646b/example/models/pyt

.gitattributes CHANGED
@@ -25,3 +25,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zstandard filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.fa filter=lfs diff=lfs merge=lfs -text
dataloader.py ADDED
@@ -0,0 +1,77 @@
+ """DeepSEA dataloader
+ """
+ import numpy as np
+ import pandas as pd
+ import pybedtools
+ from pybedtools import BedTool
+ from kipoi.data import Dataset
+ from kipoi.metadata import GenomicRanges
+ from kipoiseq.extractors import FastaStringExtractor
+ import linecache
+ from kipoiseq.transforms.functional import one_hot_dna
+
+ # --------------------------------------------
+
+
+ class BedToolLinecache(BedTool):
+     """Fast BedTool accessor by Ziga Avsec
+
+     A plain BedTool loops through the whole file to get the
+     line of interest, so access is O(n).
+     """
+
+     def __getitem__(self, idx):
+         line = linecache.getline(self.fn, idx + 1)
+         return pybedtools.create_interval_from_list(line.strip().split("\t"))
+
+
+ class SeqDataset(Dataset):
+     """
+     Args:
+         intervals_file: bed3 file containing intervals
+         fasta_file: file path; Genome sequence
+         target_file: file path; path to the targets in the csv format
+     """
+
+     def __init__(self, intervals_file, fasta_file, target_file=None, use_linecache=False):
+
+         # intervals
+         if use_linecache:
+             self.bt = BedToolLinecache(intervals_file)
+         else:
+             self.bt = BedTool(intervals_file)
+         self.fasta_file = fasta_file
+         self.fasta_extractor = None
+
+         # Targets
+         if target_file is not None:
+             self.targets = pd.read_csv(target_file)
+         else:
+             self.targets = None
+
+     def __len__(self):
+         return len(self.bt)
+
+     def __getitem__(self, idx):
+         if self.fasta_extractor is None:
+             self.fasta_extractor = FastaStringExtractor(self.fasta_file)
+
+         interval = self.bt[idx]
+
+         # Intervals need to be 1000bp wide
+         assert interval.stop - interval.start == 1000
+
+         if self.targets is not None:
+             y = self.targets.iloc[idx].values
+         else:
+             y = {}
+
+         # Run the fasta extractor
+         seq = one_hot_dna(self.fasta_extractor.extract(interval), dtype=np.float32)  # TODO: Remove additional dtype after kipoiseq gets a new release
+         return {
+             "inputs": seq,
+             "targets": y,
+             "metadata": {
+                 "ranges": GenomicRanges.from_interval(interval)
+             }
+         }
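
For reference, a minimal sketch of exercising this dataloader directly against the example files added below (illustrative; assumes kipoi, kipoiseq, and pybedtools are installed and the LFS-tracked fasta has been fetched):

from dataloader import SeqDataset

# Paths taken from the `example` fields in dataloader.yaml
ds = SeqDataset(intervals_file="example_files/intervals.tsv",
                fasta_file="example_files/hg38_chr22.fa")

sample = ds[0]                        # extract and one-hot encode the first interval
print(sample["inputs"].shape)         # (1000, 4), as declared in the output_schema
print(sample["metadata"]["ranges"])   # GenomicRanges for chr22:4997-5997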
dataloader.yaml ADDED
@@ -0,0 +1,45 @@
+ defined_as: dataloader.SeqDataset
+ args:
+   intervals_file:
+     doc: bed6 file with columns `chrom start end id score strand`
+     example: example_files/intervals.tsv
+   fasta_file:
+     doc: Reference genome sequence
+     example: example_files/hg38_chr22.fa
+   target_file:
+     doc: path to the targets (.tsv) file
+     optional: True
+   use_linecache:
+     doc: if True, use linecache https://docs.python.org/3/library/linecache.html to access bed file rows
+     optional: True
+ info:
+   authors:
+     - name: Lara Urban
+       github: LaraUrban
+     - name: Ziga Avsec
+       github: avsecz
+   doc: Dataloader for the DeepSEA model.
+ dependencies:
+   conda:
+     - python
+     - numpy
+     - pandas
+     - cython
+   pip:
+     - cython
+     - pybedtools
+ output_schema:
+   inputs:
+     name: input
+     shape: (1000, 4)
+     special_type: DNASeq
+     doc: DNA sequence
+     associated_metadata: ranges
+   targets:
+     name: epigen_mod
+     shape: (1, )
+     doc: Specific epigenetic feature class (multi-task binary classification)
+   metadata:
+     ranges:
+       type: GenomicRanges
+       doc: Ranges describing inputs.seq
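
The same dataloader can also be instantiated through the kipoi API; a hedged sketch, assuming kipoi's get_dataloader_factory accepts the local-directory source used elsewhere in kipoi:

import kipoi

# "." points at the directory containing dataloader.yaml
Dl = kipoi.get_dataloader_factory(".", source="dir")
dl = Dl(intervals_file="example_files/intervals.tsv",
        fasta_file="example_files/hg38_chr22.fa")
for batch in dl.batch_iter(batch_size=4):   # batches of 4 one-hot sequences
    print(batch["inputs"].shape)            # (4, 1000, 4)
    break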
example_files/hg38_chr22.fa ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:03397edb6bea565057a4f6f643daaec4399d8a8429eec15ee1e2845fad850fe6
+ size 50818476
example_files/hg38_chr22.fa.fai ADDED
@@ -0,0 +1 @@
+ chr22 50818468 7 50818468 50818469
example_files/intervals.tsv ADDED
@@ -0,0 +1,14 @@
+ chr22 4997 5997 1 0 +
+ chr22 5330 6330 2 0 -
+ chr22 6728 7728 3 0 -
+ chr22 3482 4482 4 0 +
+ chr22 7989 8989 5 0 +
+ chr22 8136 9136 6 0 +
+ chr22 3617 4617 7 0 -
+ chr22 7887 8887 8 0 +
+ chr22 8428 9428 9 0 +
+ chr22 9444 10444 10 0 +
+ chr22 41 1041 11 0 +
+ chr22 5777 6777 12 0 +
+ chr22 7084 8084 13 0 +
+ chr22 5725 6725 14 0 +
example_files/test.json ADDED
@@ -0,0 +1,3 @@
+ {"intervals_file": "intervals.tsv",
+  "fasta_file": "hg38_chr22.fa"
+ }
expected.pred.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:27ec18f672e5afc40381b6874c885b68f7c7ee805064d926e1dd91614bc66590
+ size 183002
model.yaml ADDED
@@ -0,0 +1,32 @@
+ defined_as: kipoi.model.PyTorchModel
+ args:
+   module_file: model_files/pyt.py
+   module_obj: simple_model
+   weights: model_files/only_weights.pth
+ default_dataloader: . # path to the directory
+ info:
+   authors:
+     - name: Roman Kreuzhuber
+       github: krrome
+   doc: Simple testing model for PyTorch
+ dependencies:
+   conda:
+     - pytorch::pytorch>=0.2.0
+ schema:
+   inputs:
+     name: input
+     shape: (1000, 4)
+     special_type: DNASeq
+     doc: DNA sequence
+     # associated_metadata: ranges # --> has to be defined in dataloader.yaml.
+     # This field is ignored in model.yaml.
+   targets:
+     shape: (1, )
+     doc: Predicted binding strength
+     column_labels:
+       - some_probability
+ test:
+   expect:
+     url: https://s3.eu-central-1.amazonaws.com/kipoi-models/predictions/example/models/pyt/expected.pred.h5
+     md5: d6d0779a7bdfb1301c76a59defd293ed
+   precision_decimal: 6
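
End to end, model plus dataloader are typically driven through kipoi's pipeline; a minimal sketch, assuming kipoi's local-directory source and that the LFS-tracked weights are present:

import kipoi

# Loads model.yaml and the default_dataloader from this directory
model = kipoi.get_model(".", source="dir")

# Predict on the example files declared in dataloader.yaml
preds = model.pipeline.predict_example()
print(preds.shape)   # one some_probability value per interval in intervals.tsv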
model_files/full_model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fb7034af1f15bf0dc242a41645b3bc781486964818fb6710f1fc78e1ca34b12b
+ size 1607392
model_files/only_weights.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:6b963f8eb38ae2ceab5a24d08a437f12b8fa94cb2cd5046be0475712e24d28ed
+ size 1601416
model_files/pyt.py ADDED
@@ -0,0 +1,75 @@
+ import torch
+ import numpy as np
+ from tqdm import tqdm
+
+
+ class Flatten(torch.nn.Module):
+     # https://gist.github.com/VoVAllen/5531c78a2d3f1ff3df772038bca37a83
+
+     def __init__(self):
+         super(Flatten, self).__init__()
+
+     def forward(self, x):
+         return x.view(x.size(0), -1)
+
+
+ def get_model():
+     # N is batch size; D_in is input dimension;
+     # H is hidden dimension; D_out is output dimension.
+     D_in, H, D_out = 4000, 100, 1
+
+     model = torch.nn.Sequential(
+         Flatten(),
+         torch.nn.Linear(D_in, H),
+         torch.nn.ReLU(),
+         torch.nn.Linear(H, D_out),
+         torch.nn.Sigmoid(),
+     )
+     return model
+
+ simple_model = get_model()
+
+
+ def generate_example_model():
+     # get model
+     model = get_model()
+
+     # define loss function
+     loss_func = torch.nn.MSELoss()
+
+     # define optimizer
+     optimizer = torch.optim.SGD(model.parameters(), lr=1e-1)
+
+     minibatch_size = 10
+     np.random.seed(0)
+     x = torch.Tensor(50, 1000, 4).uniform_(0, 1)
+     y = torch.Tensor(50).uniform_(0, 1)
+
+     for epoch in tqdm(range(10)):
+         for mbi in tqdm(range(np.ceil(x.size()[0] / minibatch_size).astype(int))):
+             minibatch = x[(mbi * minibatch_size):min(((mbi + 1) * minibatch_size), x.size()[0])]
+             target = torch.autograd.Variable(y[(mbi * minibatch_size):min(((mbi + 1) * minibatch_size), x.size()[0])])
+             model.zero_grad()
+
+             # forward pass
+             out = model(torch.autograd.Variable(minibatch))
+
+             # backward pass
+             L = loss_func(out, target)  # calculate loss
+             L.backward()  # calculate gradients
+             optimizer.step()  # make an update step
+
+     torch.save(model, "model_files/full_model.pth")
+     torch.save(model.state_dict(), "model_files/only_weights.pth")
+
+ ## To comply with OldPyTorchModel
+ def get_model_w_weights():
+     model = get_model()
+     model.load_state_dict(torch.load("model_files/only_weights.pth"))
+     return model
+
+ def test_same_weights(dict1, dict2):
+     for k in dict1:
+         assert np.all(dict1[k].numpy() == dict2[k].numpy())
+
+ # test_same_weights(model.state_dict(), model_2.state_dict())
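
For illustration, a short sketch that checks the (1000, 4) -> (1,) shape contract from model.yaml on a dummy batch (assumes the repository root is on sys.path so model_files.pyt is importable):

import torch
from model_files.pyt import simple_model

x = torch.zeros(2, 1000, 4)   # two dummy one-hot DNA sequences
out = simple_model(x)         # Flatten -> Linear(4000, 100) -> ReLU -> Linear(100, 1) -> Sigmoid
print(out.shape)              # torch.Size([2, 1])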
pyt.py ADDED
@@ -0,0 +1 @@
+ model_files/pyt.py
wrong.pred.h5 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:178dccd292a4f787012e70dc91fc9377f85ec88637484802900480654223ec7e
+ size 183002