# ---
# jupyter:
#   jupytext:
#     text_representation:
#       extension: .py
#       format_name: light
#       format_version: '1.5'
#     jupytext_version: 1.16.2
#   kernelspec:
#     display_name: temps
#     language: python
#     name: temps
# ---
# +
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from astropy.io import fits
import os
from astropy.table import Table
from temps.utils import nmad
from scipy import stats
from pathlib import Path
# -
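# `nmad` from temps.utils is used below as the scatter metric. For reference, here is a
# minimal sketch of the standard normalized median absolute deviation, assuming
# temps.utils.nmad follows this common photo-z definition (the sketch is illustrative,
# not the package implementation):
# +
def nmad_sketch(delta_z):
    """Normalized MAD: 1.4826 * median(|x - median(x)|)."""
    delta_z = np.asarray(delta_z)
    return 1.4826 * np.median(np.abs(delta_z - np.median(delta_z)))
# -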
# Define here the directory containing the photometric catalogues
parent_dir = '/data/astro/scratch/lcabayol/EUCLID/DAz/DC2_results_to_share/'
# +
# List of FITS files to be processed
fits_files = [
    'GDE_RF_full.fits',
    'GDE_PHOSPHOROS_V2_full.fits',
    'OIL_LEPHARE_full.fits',
    'JDV_DNF_A_full.fits',
    'JSP_FRANKENZ_full.fits',
    'MBR_METAPHOR_full.fits',
    'GDE_ADABOOST_full.fits',
    'CSC_GPZ_best_full.fits',
    'SFO_CPZ_full.fits',
    'AAL_NNPZ_V3_full.fits',
]
# Corresponding redshift column names
redshift_columns = [
    'REDSHIFT_RF',
    'REDSHIFT_PHOSPHOROS',
    'REDSHIFT_LEPHARE',
    'REDSHIFT_DNF',
    'REDSHIFT_FRANKENZ',
    'REDSHIFT_METAPHOR',
    'REDSHIFT_ADABOOST',
    'REDSHIFT_GPZ',
    'REDSHIFT_CPZ',
    'REDSHIFT_NNPZ',
]
# Initialize an empty DataFrame for merging
merged_df = pd.DataFrame()
# Process each FITS file
for fits_file, redshift_col in zip(fits_files, redshift_columns):
    print(fits_file)
    # Open the FITS file and load the first table HDU into a DataFrame
    hdu_list = fits.open(os.path.join(parent_dir, fits_file))
    df = Table(hdu_list[1].data).to_pandas()
    df = df[df.REDSHIFT != 0]
    df = df[['ID', 'VIS', 'SPECZ', 'REDSHIFT']].rename(columns={'REDSHIFT': redshift_col})

    # Merge with the main DataFrame
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on=['ID', 'VIS', 'SPECZ'], how='outer')
# -
# ## OPEN DATA
# +
modules_dir = Path('/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5')
filename_valid='euclid_cosmos_DC2_S1_v2.1_valid_matched.fits'
hdu_list = fits.open(modules_dir/filename_valid)
cat_full = Table(hdu_list[1].data).to_pandas()
cat_full = cat_full[['ID','z_spec_S15','reliable_S15','mu_class_L07']]
merged_df['reliable_S15'] = cat_full.reliable_S15
merged_df['z_spec_S15'] = cat_full.z_spec_S15
merged_df['mu_class_L07'] = cat_full.mu_class_L07
merged_df['ID_catfull'] = cat_full.ID
# -
merged_df_specz = merged_df[(merged_df.z_spec_S15>0)&(merged_df.SPECZ>0)&(merged_df.reliable_S15==1)&(merged_df.mu_class_L07==1)&(merged_df.VIS!=np.inf)]
# ## ONLY SPECZ SAMPLE
scatter, outliers = [], []
for im, method in enumerate(redshift_columns):
    print(method)
    df_method = merged_df_specz.dropna(subset=[method])
    zerr = (df_method.SPECZ - df_method[method]) / (1 + df_method.SPECZ)
    print(len(zerr[np.abs(zerr) > 0.15]) / len(zerr))
    scatter.append(nmad(zerr))
    outliers.append(len(zerr[np.abs(zerr) > 0.15]) / len(df_method))
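# Optional convenience (a sketch, not part of the original analysis): gather the
# per-method metrics computed above into a small summary table for inspection.
# +
summary_specz = pd.DataFrame({
    'method': redshift_columns,
    'nmad': scatter,
    'outlier_frac': outliers,
})
print(summary_specz.sort_values('nmad'))
# -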
# +
labs = [
    'RF',
    'PHOSPHOROS',
    'LEPHARE',
    'DNF',
    'FRANKENZ',
    'METAPHOR',
    'ADABOOST',
    'GPZ',
    'CPZ',
    'NNPZ',
]
# Colors from colormap
cmap = plt.get_cmap('tab20')
colors = [cmap(i / len(labs)) for i in range(len(labs))]
# Plotting
plt.figure(figsize=(10, 6))
for i in range(len(labs)):
    plt.scatter(outliers[i] * 100, scatter[i], color=colors[i], label=labs[i], marker='^')
# Adding legend
plt.legend(fontsize=12)
plt.ylabel(r'NMAD $[\Delta z]$', fontsize=14)
plt.xlabel('Outlier fraction [%]', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(5,35)
plt.ylim(0,0.14)
# Display plot
plt.show()
# -
# ### ADD TEMPS PREDICTIONS
import torch
from temps.archive import Archive
from temps.temps_arch import EncoderPhotometry, MeasureZ
from temps.temps import TempsModule
# +
data_dir = Path('/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5')
filename_valid='euclid_cosmos_DC2_S1_v2.1_valid_matched.fits'
hdu_list = fits.open(data_dir/filename_valid)
cat_phot = Table(hdu_list[1].data).to_pandas()
# -
cat_phot = cat_phot[cat_phot.ID.isin(merged_df_specz.ID_catfull)]
# +
photoz_archive = Archive(path='/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5',
                         only_zspec=True)
f, ferr = photoz_archive._extract_fluxes(catalogue=cat_phot)
col, colerr = photoz_archive._to_colors(f, ferr)
ID = cat_phot.ID
# +
modules_dir = Path('/nfs/pic.es/user/l/lcabayol/EUCLID/TEMPS/data/models')
# Load the photometry encoder and redshift network trained with domain adaptation (DA)
nn_features = EncoderPhotometry()
nn_features.load_state_dict(torch.load(modules_dir / 'modelF_DA.pt', map_location=torch.device('cpu')))
nn_z = MeasureZ(num_gauss=6)
nn_z.load_state_dict(torch.load(modules_dir / 'modelZ_DA.pt', map_location=torch.device('cpu')))

temps_module = TempsModule(nn_features, nn_z)
z, pz, odds = temps_module.get_pz(input_data=torch.Tensor(col),
                                  return_pz=True)

df = pd.DataFrame(np.c_[ID, z],
                  columns=['ID', 'TEMPS'])
df = df.dropna()
# -
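# Optional sanity check (a sketch, not part of the original analysis): inspect the
# distribution of the `odds` quality estimate returned by TempsModule.get_pz before
# merging with the comparison table.
# +
plt.figure(figsize=(6, 4))
plt.hist(np.ravel(np.asarray(odds)), bins=50)
plt.xlabel('odds')
plt.ylabel('N galaxies')
plt.show()
# -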
merged_df_specz= merged_df_specz.merge(df, left_on='ID_catfull', right_on='ID')
# Corresponding redshift column names
redshift_columns = redshift_columns + ['TEMPS']
scatter, outliers = [], []
for im, method in enumerate(redshift_columns):
    print(method)
    df_method = merged_df_specz.dropna(subset=[method])
    zerr = (df_method.SPECZ - df_method[method]) / (1 + df_method.SPECZ)
    print(len(zerr[np.abs(zerr) > 0.15]) / len(zerr))
    scatter.append(nmad(zerr))
    outliers.append(len(zerr[np.abs(zerr) > 0.15]) / len(df_method))
# +
labs = [
    'RF',
    'PHOSPHOROS',
    'LEPHARE',
    'DNF',
    'FRANKENZ',
    'METAPHOR',
    'ADABOOST',
    'GPZ',
    'CPZ',
    'NNPZ',
    'TEMPS',
]
# Colors from colormap
cmap = plt.get_cmap('tab20')
colors = [cmap(i / len(labs)) for i in range(len(labs))]
# Plotting
plt.figure(figsize=(10, 6))
for i in range(len(labs)):
    plt.scatter(outliers[i] * 100, scatter[i], color=colors[i], label=labs[i], marker='^')
# Adding legend
plt.legend(fontsize=12)
plt.ylabel(r'NMAD $[\Delta z]$', fontsize=14)
plt.xlabel('Outlier fraction [%]', fontsize=14)
plt.xticks(fontsize=14)
plt.yticks(fontsize=14)
plt.xlim(5,35)
plt.ylim(0,0.14)
# Display plot
plt.show()
# -
# ## ANOTHER SELECTION
# +
# List of FITS files to be processed
fits_files = [
    'GDE_RF_full.fits',
    'GDE_PHOSPHOROS_V2_full.fits',
    'OIL_LEPHARE_full.fits',
    'JDV_DNF_A_full.fits',
    'JSP_FRANKENZ_full.fits',
    'MBR_METAPHOR_full.fits',
    'GDE_ADABOOST_full.fits',
    'CSC_GPZ_best_full.fits',
    'SFO_CPZ_full.fits',
    'AAL_NNPZ_V3_full.fits',
]
# Corresponding redshift column names
redshift_columns = [
    'REDSHIFT_RF',
    'REDSHIFT_PHOSPHOROS',
    'REDSHIFT_LEPHARE',
    'REDSHIFT_DNF',
    'REDSHIFT_FRANKENZ',
    'REDSHIFT_METAPHOR',
    'REDSHIFT_ADABOOST',
    'REDSHIFT_GPZ',
    'REDSHIFT_CPZ',
    'REDSHIFT_NNPZ',
]
use_columns = [
    'USE_RF',
    'USE_PHOSPHOROS',
    'USE_LEPHARE',
    'USE_DNF',
    'USE_FRANKENZ',
    'USE_METAPHOR',
    'USE_ADABOOST',
    'USE_GPZ',
    'USE_CPZ',
    'USE_NNPZ',
]
# Initialize an empty DataFrame for merging
merged_df = pd.DataFrame()
# Process each FITS file
for fits_file, redshift_col, use_col in zip(fits_files, redshift_columns, use_columns):
    print(fits_file)
    # Open the FITS file and load the first table HDU into a DataFrame
    hdu_list = fits.open(os.path.join(parent_dir, fits_file))
    df = Table(hdu_list[1].data).to_pandas()
    df = df[df.REDSHIFT != 0]
    df = df[['ID', 'VIS', 'SPECZ', 'REDSHIFT', 'L15PHZ', 'USE']].rename(columns={'REDSHIFT': redshift_col, 'USE': use_col})

    # Merge with the main DataFrame
    if merged_df.empty:
        merged_df = df
    else:
        merged_df = pd.merge(merged_df, df, on=['ID', 'VIS', 'SPECZ', 'L15PHZ'], how='outer')
# -
merged_df['comp_z'] = np.where(merged_df['SPECZ'] > 0, merged_df['SPECZ'], merged_df['L15PHZ'])
#merged_df = merged_df[(merged_df.comp_z>0)&(merged_df.comp_z<4)&(merged_df.VIS>23.5)]
merged_df = merged_df[(merged_df.comp_z>0)&(merged_df.comp_z<4)&(merged_df.VIS<25)]
# +
modules_dir = Path('/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5')
filename_valid='euclid_cosmos_DC2_S1_v2.1_valid_matched.fits'
hdu_list = fits.open(modules_dir/filename_valid)
cat_full = Table(hdu_list[1].data).to_pandas()
merged_df['ID_catfull'] = cat_full.ID
# +
data_dir = Path('/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5')
filename_valid='euclid_cosmos_DC2_S1_v2.1_valid_matched.fits'
hdu_list = fits.open(data_dir/filename_valid)
cat_phot = Table(hdu_list[1].data).to_pandas()
# -
cat_phot = cat_phot[cat_phot.ID.isin(merged_df.ID_catfull)]
# +
photoz_archive = Archive(path='/data/astro/scratch/lcabayol/insight/data/Euclid_EXT_MER_PHZ_DC2_v1.5',
                         only_zspec=False)
f, ferr = photoz_archive._extract_fluxes(catalogue=cat_phot)
col, colerr = photoz_archive._to_colors(f, ferr)
ID = cat_phot.ID
# +
modules_dir = Path('/nfs/pic.es/user/l/lcabayol/EUCLID/TEMPS/data/models')
# TEMPS with domain adaptation (DA models)
nn_features = EncoderPhotometry()
nn_features.load_state_dict(torch.load(modules_dir / 'modelF_DA.pt', map_location=torch.device('cpu')))
nn_z = MeasureZ(num_gauss=6)
nn_z.load_state_dict(torch.load(modules_dir / 'modelZ_DA.pt', map_location=torch.device('cpu')))
temps_module = TempsModule(nn_features, nn_z)
z, pz, odds = temps_module.get_pz(input_data=torch.Tensor(col),
                                  return_pz=True)

# TEMPS without domain adaptation (the variant labelled 'TEMPS - no DA' below)
nn_features = EncoderPhotometry()
nn_features.load_state_dict(torch.load(modules_dir / 'modelF_z.pt', map_location=torch.device('cpu')))
nn_z = MeasureZ(num_gauss=6)
nn_z.load_state_dict(torch.load(modules_dir / 'modelZ_z.pt', map_location=torch.device('cpu')))
temps_module = TempsModule(nn_features, nn_z)
znoda, pz, odds_noda = temps_module.get_pz(input_data=torch.Tensor(col),
                                           return_pz=True)

# TEMPS L15 models (the variant labelled 'TEMPS - L15' below)
nn_features = EncoderPhotometry()
nn_features.load_state_dict(torch.load(modules_dir / 'modelF_L15.pt', map_location=torch.device('cpu')))
nn_z = MeasureZ(num_gauss=6)
nn_z.load_state_dict(torch.load(modules_dir / 'modelZ_L15.pt', map_location=torch.device('cpu')))
temps_module = TempsModule(nn_features, nn_z)
z_L15, pz, odds_L15 = temps_module.get_pz(input_data=torch.Tensor(col),
                                          return_pz=True)

df = pd.DataFrame(np.c_[ID, z, odds, znoda, odds_noda, z_L15, odds_L15],
                  columns=['ID', 'TEMPS', 'flag_TEMPS', 'TEMPS_noda', 'flag_TEMPSnoda', 'TEMPS_L15', 'flag_L15'])
df = df.dropna()
# +
percent = 0.3
df['USE_TEMPS'] = np.zeros(shape=len(df))
# Threshold at the 30th percentile of the TEMPS quality flag
threshold = df['flag_TEMPS'].quantile(percent)
# Set 'USE_TEMPS' to 1 for the ~70% of objects with flag values at or above the threshold
df['USE_TEMPS'] = np.where(df['flag_TEMPS'] >= threshold, 1, 0)
# +
percent = 0.3
df['USE_TEMPS_noda'] = np.zeros(shape=len(df))
# Threshold at the 30th percentile of the no-DA quality flag
threshold = df['flag_TEMPSnoda'].quantile(percent)
# Set 'USE_TEMPS_noda' to 1 for the ~70% of objects with flag values at or above the threshold
df['USE_TEMPS_noda'] = np.where(df['flag_TEMPSnoda'] >= threshold, 1, 0)
# +
percent = 0.3
df['USE_TEMPS_L15'] = np.zeros(shape=len(df))
# Threshold at the 30th percentile of the L15 quality flag
threshold = df['flag_L15'].quantile(percent)
# Set 'USE_TEMPS_L15' to 1 for the ~70% of objects with flag values at or above the threshold
df['USE_TEMPS_L15'] = np.where(df['flag_L15'] >= threshold, 1, 0)
# -
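# The three cells above repeat the same quantile cut. A small helper capturing that
# pattern (an illustrative sketch; `quality_cut` is not part of the TEMPS package):
# +
def quality_cut(frame, flag_col, quantile=0.3):
    """Return 1 where `flag_col` is at or above the given quantile, else 0."""
    threshold = frame[flag_col].quantile(quantile)
    return np.where(frame[flag_col] >= threshold, 1, 0)

# e.g. df['USE_TEMPS'] = quality_cut(df, 'flag_TEMPS', percent)
# -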
merged_df_temps = merged_df.merge(df, left_on='ID_catfull', right_on='ID')
# Corresponding redshift column names
redshift_columns = [
'REDSHIFT_RF',
'REDSHIFT_PHOSPHOROS',
'REDSHIFT_LEPHARE',
'REDSHIFT_DNF',
'REDSHIFT_FRANKENZ',
'REDSHIFT_METAPHOR',
'REDSHIFT_ADABOOST',
'REDSHIFT_GPZ',
'REDSHIFT_CPZ',
'REDSHIFT_NNPZ'
]
redshift_columns = redshift_columns + ['TEMPS', 'TEMPS_noda', 'TEMPS_L15']
use_columns = use_columns + ['USE_TEMPS','USE_TEMPS_noda', 'USE_TEMPS_L15']
merged_df_temps = merged_df_temps[merged_df_temps.VIS <25]
scatter, outliers, size = [], [], []
for method, use in zip(redshift_columns, use_columns):
    print(method)
    # df_method = merged_df_temps.dropna(subset=method)
    df_method = merged_df_temps[(merged_df_temps.loc[:, method] > 0.2) & (merged_df_temps.loc[:, method] < 2.6)]
    df_method = df_method[df_method.VIS < 24.5]
    norm_size = len(df_method)
    df_method = df_method[df_method.loc[:, use] == 1]
    zerr = (df_method.comp_z - df_method[method]) / (1 + df_method.comp_z)
    scatter.append(nmad(zerr))
    outliers.append(len(zerr[np.abs(zerr) > 0.15]) / len(df_method))
    size.append(len(df_method) / norm_size)
    print(nmad(zerr), len(zerr[np.abs(zerr) > 0.15]) / len(df_method), len(df_method) / norm_size)
scatter_faint, outliers_faint, size_faint = [], [], []
for method, use in zip(redshift_columns, use_columns):
    print(method)
    # df_method = merged_df_temps.dropna(subset=method)
    df_method = merged_df_temps[(merged_df_temps.loc[:, 'VIS'] > 23.5) & (merged_df_temps.loc[:, 'VIS'] < 25)]
    # df_method = df_method[df_method.loc[:, use] == 1]
    # df_method = merged_df_temps[(merged_df_temps.loc[:, 'VIS'] > 23.5) & (merged_df_temps.loc[:, 'VIS'] < 24.5)]
    zerr = (df_method.comp_z - df_method[method]) / (1 + df_method.comp_z)
    scatter_faint.append(nmad(zerr))
    outliers_faint.append(len(zerr[np.abs(zerr) > 0.15]) / len(df_method))
    size_faint.append(len(df_method))
    print(nmad(zerr), len(zerr[np.abs(zerr) > 0.15]) / len(df_method), len(df_method))
# +
import matplotlib.pyplot as plt
import numpy as np
from pastamarkers import markers
# Define labels for the models
labs = [
'RF', 'PHOSPHOROS', 'LEPHARE', 'DNF', 'FRANKENZ', 'METAPHOR',
'ADABOOST', 'GPZ', 'CPZ', 'NNPZ', 'TEMPS', 'TEMPS - no DA', 'TEMPS - L15'
]
markers_pasta = [
    markers.penne, markers.conchiglie, markers.tortellini, markers.creste,
    markers.spaghetti, markers.ravioli, markers.tagliatelle, markers.mezzelune,
    markers.puntine, markers.stelline, 's', 'o', '^',
]
labs_faint = [f"{lab}_faint" for lab in labs] # Labels for the faint data
# Colors from colormap
cmap = plt.get_cmap('tab20')
colors = [cmap(i / len(labs)) for i in range(len(labs))]
# Create subplots with 2 panels stacked vertically
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(10, 12), sharex=False)
# Plotting for the top panel
# Annotate each point with the percentage of objects kept by the USE selection
for i in range(len(labs)):
    if labs[i] == 'TEMPS - no DA' or labs[i] == 'TEMPS - L15':
        # These two variants only get a legend entry in the top panel
        ax1.scatter(np.nan, np.nan, color=colors[i], label=labs[i], marker=markers_pasta[i], s=300)
    elif labs[i] == 'CPZ':
        ax1.scatter(outliers[i] * 100, scatter[i], color=colors[i], label=labs[i], marker=markers_pasta[i], s=300)
        ax1.text(outliers[i] * 100 - 0.2, scatter[i] + 0.001, f'{int(np.around(size[i] * 100))}', fontsize=12, verticalalignment='bottom')
    elif labs[i] == 'ADABOOST':
        ax1.scatter(outliers[i] * 100, scatter[i], color=colors[i], label=labs[i], marker=markers_pasta[i], s=300)
        ax1.text(outliers[i] * 100 - 0.5, scatter[i] - 0.004, f'{int(np.around(size[i] * 100))}', fontsize=12, verticalalignment='bottom')
    else:
        ax1.scatter(outliers[i] * 100, scatter[i], color=colors[i], label=labs[i], marker=markers_pasta[i], s=300)
        ax1.text(outliers[i] * 100 - 0.5, scatter[i] + 0.001, f'{int(np.around(size[i] * 100))}', fontsize=12, verticalalignment='bottom')
# Customizations for the top plot
ax1.set_ylabel(r'NMAD $[\Delta z]$', fontsize=24)
ax1.legend(fontsize=14)
ax1.tick_params(axis='both', which='major', labelsize=20)
# Plotting for the bottom panel (faint data)
for i in range(len(labs)):
    ax2.scatter(outliers_faint[i] * 100, scatter_faint[i], color=colors[i], label=labs[i], marker=markers_pasta[i], s=300)
# Customizations for the bottom plot
ax2.set_ylabel(r'NMAD $[\Delta z]$', fontsize=24)
ax2.set_xlabel('Outlier fraction [%]', fontsize=24)
ax2.tick_params(axis='both', which='major', labelsize=20)
# Display the plot
plt.tight_layout()
#plt.savefig('Comparison_paper.pdf', bbox_inches='tight')
plt.show()
# -
# NOTE: `cat_val` and `cat_all` are not defined earlier in this notebook; this trailing cell
# only runs if those catalogues (with RA/DEC and the S15/L15 columns) are loaded beforehand.
cat_val_z = cat_val[['RA', 'DEC']].merge(cat_all[['RA', 'DEC', 'z_spec_S15', 'photo_z_L15', 'reliable_S15', 'mu_class_L07']], on=['RA', 'DEC'])
merged_df = merged_df.merge(cat_val_z, on=['RA', 'DEC'])