Nicky Nicolson committed on
Commit
424fd94
·
1 Parent(s): 0c0ffb8

Moved column creation into the format conversion script

Browse files
Files changed (2) hide show
  1. Dockerfile +3 -3
  2. tab2csv.py +13 -0
Dockerfile CHANGED
@@ -17,9 +17,9 @@ RUN ls -l /data
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
- COPY ./extractcollectorname.py code/extractcollectorname.py
21
- RUN python tab2csv.py /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
22
- RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
23
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
24
  RUN ls -l /code
25
  RUN sqlite-utils tables /code/gbifocc.db --counts
 
17
  RUN unzip /data/gbif-occs.zip -d /data
18
  RUN ls -l /data
19
  COPY ./tab2csv.py /code/tab2csv.py
20
+ #COPY ./extractcollectorname.py /code/extractcollectorname.py
21
+ RUN python tab2csv.py --createcols /data/${GBIF_DOWNLOAD_ID}.csv /data/gbifocc-temp.csv
22
+ #RUN python extractcollectorname.py /data/gbifocc-temp.csv /data/gbifocc.csv
23
  RUN csvs-to-sqlite /data/gbifocc.csv /code/gbifocc.db
24
  RUN ls -l /code
25
  RUN sqlite-utils tables /code/gbifocc.db --counts
tab2csv.py CHANGED
@@ -1,9 +1,13 @@
1
  import argparse
2
  import pandas as pd
 
 
 
3
 
4
  if __name__ == '__main__':
5
  parser = argparse.ArgumentParser()
6
  parser.add_argument("inputfile")
 
7
  parser.add_argument("outputfile")
8
  args = parser.parse_args()
9
 
@@ -12,4 +16,13 @@ if __name__ == '__main__':
12
  keep_default_na=False,
13
  on_bad_lines='skip',
14
  sep='\t')
 
 
 
 
 
 
 
 
 
15
  df.to_csv(args.outputfile, index=False, sep=',')
 
1
  import argparse
2
  import pandas as pd
3
+ import bananompy
4
+ from tqdm import tqdm
5
+ tqdm.pandas()
6
 
7
  if __name__ == '__main__':
8
  parser = argparse.ArgumentParser()
9
  parser.add_argument("inputfile")
10
+ parser.add_argument("-createcols", action='store_true')
11
  parser.add_argument("outputfile")
12
  args = parser.parse_args()
13
 
 
16
  keep_default_na=False,
17
  on_bad_lines='skip',
18
  sep='\t')
19
+ if args.createcols:
20
+ # Extract unique recordedBy values
21
+ df_rb = df[['recordedBy']].drop_duplicates()
22
+ df_rb['recordedBy_first_familyname'] = df_rb.recordedBy.progress_apply(getFirstFamilyName)
23
+ # Apply back to main dataframe
24
+ df = pd.merge(left = df, right=df_rb, left_on='recordedBy', right_on='recordedBy', how='left')
25
+ # Add column holding collector name and number
26
+ mask = (df.recordNumber.notnull())
27
+ df.loc[mask,'collectorNameAndNumber']=df[mask].apply(lambda row: '{} {}'.format(row['recordedBy_first_familyname'],row['recordNumber']),axis=1)
28
  df.to_csv(args.outputfile, index=False, sep=',')