Grant committed on
Commit
62de390
1 Parent(s): de333ca
.gitattributes CHANGED
@@ -35,5 +35,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ALL_hum_isoforms_ESM1b_dels.zip filter=lfs diff=lfs merge=lfs -text
37
  del_sub_data.csv.gz filter=lfs diff=lfs merge=lfs -text
38
- rand_samp_gw_del_sub.csv.gz filter=lfs diff=lfs merge=lfs -text
39
  ALL_hum_isoforms_ESM1b_del_sub.zip filter=lfs diff=lfs merge=lfs -text
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  ALL_hum_isoforms_ESM1b_dels.zip filter=lfs diff=lfs merge=lfs -text
37
  del_sub_data.csv.gz filter=lfs diff=lfs merge=lfs -text
 
38
  ALL_hum_isoforms_ESM1b_del_sub.zip filter=lfs diff=lfs merge=lfs -text
.ipynb_checkpoints/process_data-checkpoint.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import zipfile
3
+ import logging
4
+
5
# Send timestamped records of level INFO and above to processing.log.
logging.basicConfig(
    filename='processing.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the whole table into memory; pandas infers gzip compression from the
# ".gz" suffix of the path.
df = pd.read_csv("del_sub_data.csv.gz")

# Distinct values of the "prot" column, in order of first appearance.
prots = list(df["prot"].unique())
12
+
13
def write_chunk_to_zip(sub_df, zf, uid):
    """Store ``sub_df`` as the CSV member ``<uid>.csv`` of the archive ``zf``.

    Args:
        sub_df: DataFrame to serialise; its index is not written.
        zf: A ``zipfile.ZipFile`` that is already open for writing.
        uid: Value used (via ``str`` formatting) as the member's base name.
    """
    member_name = f"{uid}.csv"
    # The member handle is a binary stream inside the archive; pandas writes
    # the CSV text straight into it, so no temporary file is needed.
    with zf.open(member_name, "w") as member:
        sub_df.to_csv(member, index=False)
16
+
17
# Write one CSV member per protein into a single archive.
with zipfile.ZipFile("ALL_hum_isoforms_ESM1b_del_sub.zip", "w") as zip_file:
    # Partition the table once instead of re-filtering the whole frame with a
    # boolean mask for every protein (which rescans every row per protein).
    # sort=False yields the proteins in order of first appearance, i.e. the
    # same order df.prot.unique() produces.
    # NOTE: groupby skips rows whose "prot" value is missing, so no
    # header-only "nan.csv" member is created for them.
    for p, prot_data in df.groupby("prot", sort=False):
        print(p)
        write_chunk_to_zip(prot_data, zip_file, p)
        # Lazy %-style arguments: same message text, formatted by logging.
        logging.info("Finished %s", p)

logging.info("Finished all")
ALL_hum_isoforms_ESM1b_del_sub.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f36e5f30fce46d55545061d3b4ad53b608a4cc8915a80ce051067f22f38fe06
3
+ size 1020611720
process_data.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import zipfile
3
+ import logging
4
+
5
# Send timestamped records of level INFO and above to processing.log.
logging.basicConfig(
    filename='processing.log',
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Load the whole table into memory; pandas infers gzip compression from the
# ".gz" suffix of the path.
df = pd.read_csv("del_sub_data.csv.gz")

# Distinct values of the "prot" column, in order of first appearance.
prots = list(df["prot"].unique())
12
+
13
def write_chunk_to_zip(sub_df, zf, uid):
    """Store ``sub_df`` as the CSV member ``<uid>.csv`` of the archive ``zf``.

    Args:
        sub_df: DataFrame to serialise; its index is not written.
        zf: A ``zipfile.ZipFile`` that is already open for writing.
        uid: Value used (via ``str`` formatting) as the member's base name.
    """
    member_name = f"{uid}.csv"
    # The member handle is a binary stream inside the archive; pandas writes
    # the CSV text straight into it, so no temporary file is needed.
    with zf.open(member_name, "w") as member:
        sub_df.to_csv(member, index=False)
16
+
17
# Write one CSV member per protein into a single archive.
with zipfile.ZipFile("ALL_hum_isoforms_ESM1b_del_sub.zip", "w") as zip_file:
    # Partition the table once instead of re-filtering the whole frame with a
    # boolean mask for every protein (which rescans every row per protein).
    # sort=False yields the proteins in order of first appearance, i.e. the
    # same order df.prot.unique() produces.
    # NOTE: groupby skips rows whose "prot" value is missing, so no
    # header-only "nan.csv" member is created for them.
    for p, prot_data in df.groupby("prot", sort=False):
        print(p)
        write_chunk_to_zip(prot_data, zip_file, p)
        # Lazy %-style arguments: same message text, formatted by logging.
        logging.info("Finished %s", p)

logging.info("Finished all")
processing.log ADDED
The diff for this file is too large to render. See raw diff
 
rand_samp_gw_del_sub.csv.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21057b72026d1e112137a3e1e1390ed00b547e725fe4917e8549d99984196677
3
+ size 417309