Spaces:
Runtime error
Runtime error
Nicky Nicolson
committed on
Commit
·
424fd94
1
Parent(s):
0c0ffb8
Moved create cols into format conv script
Browse files- Dockerfile +3 -3
- tab2csv.py +13 -0
Dockerfile
CHANGED
@@ -17,9 +17,9 @@ RUN ls -l /data
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
-
COPY ./extractcollectorname.py code/extractcollectorname.py
|
21 |
-
RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
|
22 |
-
RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
23 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
24 |
RUN ls -l /code
|
25 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
|
|
17 |
RUN unzip /data/gbif-occs.zip -d /data
|
18 |
RUN ls -l /data
|
19 |
COPY ./tab2csv.py /code/tab2csv.py
|
20 |
+
#COPY ./extractcollectorname.py /code/extractcollectorname.py
|
21 |
+
# Write directly to /data/gbifocc.csv: the extractcollectorname.py step that used to produce it is commented out, and csvs-to-sqlite reads /data/gbifocc.csv.
RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc.csv
|
22 |
+
#RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
|
23 |
RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
|
24 |
RUN ls -l /code
|
25 |
RUN sqlite-utils tables /code/gbifocc.db --counts
|
tab2csv.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
|
|
|
|
|
|
3 |
|
4 |
if __name__ == '__main__':
|
5 |
parser = argparse.ArgumentParser()
|
6 |
parser.add_argument("inputfile")
|
|
|
7 |
parser.add_argument("outputfile")
|
8 |
args = parser.parse_args()
|
9 |
|
@@ -12,4 +16,13 @@ if __name__ == '__main__':
|
|
12 |
keep_default_na=False,
|
13 |
on_bad_lines='skip',
|
14 |
sep='\t')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
df.to_csv(args.outputfile, index=False, sep=',')
|
|
|
1 |
import argparse
|
2 |
import pandas as pd
|
3 |
+
import bananompy
|
4 |
+
from tqdm import tqdm
|
5 |
+
tqdm.pandas()
|
6 |
|
7 |
if __name__ == '__main__':
|
8 |
parser = argparse.ArgumentParser()
|
9 |
parser.add_argument("inputfile")
|
10 |
+
# Optional flag enabling the derived collector-name columns; the Dockerfile passes "--createcols", the single-dash spelling is kept as an alias.
parser.add_argument("--createcols", "-createcols", action='store_true')
|
11 |
parser.add_argument("outputfile")
|
12 |
args = parser.parse_args()
|
13 |
|
|
|
16 |
keep_default_na=False,
|
17 |
on_bad_lines='skip',
|
18 |
sep='\t')
|
19 |
+
if args.createcols:
|
20 |
+
# Extract unique recordedBy values
|
21 |
+
df_rb = df[['recordedBy']].drop_duplicates()
|
22 |
+
# NOTE(review): getFirstFamilyName is not defined or imported in the visible lines of tab2csv.py; confirm it is in scope, otherwise this raises NameError whenever the createcols flag is set.
df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
|
23 |
+
# Apply back to main dataframe
|
24 |
+
df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
|
25 |
+
# Add column holding collector name and number
|
26 |
+
mask = (df.recordNumber.notnull())
|
27 |
+
df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
|
28 |
df.to_csv(args.outputfile, index=False, sep=',')
|