The evaluate_fit
command calculates 6 error statistics, including the stats reported in the manuscript.
Smaller values are better.
!sfacts evaluate_fit \
--num-format '%.5f' \
--scores mgen_error fwd_genotype_error rev_genotype_error bc_error unifrac_error entropy_error \
--outpath sim.eval_all_fits.tsv \
sim.world.nc \
sim.filt.ss-0.fit.world.nc \
sim.filt.ss-0.fit2.world.nc \
sim.filt.fit3.world.nc
!column -t sim.eval_all_fits.tsv
score sim.world.nc sim.filt.ss-0.fit.world.nc sim.filt.ss-0.fit2.world.nc sim.filt.fit3.world.nc mgen_error 0.01831 0.01824 0.01804 0.01766 fwd_genotype_error 0.00000 0.00000 0.00000 0.00000 rev_genotype_error 0.00000 0.00001 0.00177 0.00196 bc_error 0.00000 0.00618 0.00833 0.00833 unifrac_error 0.00000 0.04412 0.04649 0.06635 entropy_error 0.00000 0.02890 0.04393 0.04393
We can visualize these on a per-sample or per-strain basis as well.
import sfacts as sf
import xarray as xr
sim = sf.World.load('sim.world.nc')
fit = sf.World.load('sim.filt.ss-0.fit.world.nc')
fit = sf.World(fit.data.assign_coords(position=fit.position))
sim = sim.sel(position=fit.position, sample=fit.sample)
mgen_error = sf.evaluation.metagenotype_error2(fit, discretized=True)
fwd_genotype_error = sf.evaluation.discretized_weighted_genotype_error(sim, fit)
rev_genotype_error = sf.evaluation.discretized_weighted_genotype_error(fit, sim)
bc_error = sf.evaluation.braycurtis_error(sim, fit)
unifrac_error = sf.evaluation.unifrac_error(sim, fit)
entropy_error = sf.evaluation.community_entropy_error(sim, fit)
sf.plot.plot_community(
sim,
col_linkage_func=lambda w: w.metagenotype.linkage('sample'),
row_linkage_func=lambda w: w.genotype.linkage('strain'),
col_colors_func=lambda w: xr.Dataset(dict(
mg=mgen_error[1],
bc=bc_error[1],
uf=unifrac_error[1],
ent=entropy_error[1].abs(),
)),
row_colors_func=lambda w: xr.Dataset(dict(
fwd=fwd_genotype_error[1],
)),
)
sf.plot.plot_community(
fit,
col_linkage_func=lambda w: w.metagenotype.linkage('sample'),
row_linkage_func=lambda w: w.genotype.linkage('strain'),
col_colors_func=lambda w: xr.Dataset(dict(
mg=mgen_error[1],
bc=bc_error[1],
uf=unifrac_error[1],
ent=entropy_error[1].abs(),
)),
row_colors_func=lambda w: xr.Dataset(dict(
rev=rev_genotype_error[1],
)),
)
sf.plot.plot_metagenotype(
fit,
col_linkage_func=lambda w: w.metagenotype.linkage('sample'),
row_linkage_func=lambda w: w.metagenotype.linkage('position'),
col_colors_func=lambda w: xr.Dataset(dict(
mg=mgen_error[1],
bc=bc_error[1],
uf=unifrac_error[1],
ent=entropy_error[1].abs(),
)),
row_colors_func=lambda w: xr.Dataset(dict(
_=w.metagenotype.mean_depth("position"),
)),
)
sf.plot.plot_genotype(
fit,
row_linkage_func=lambda w: w.genotype.linkage('strain'),
col_linkage_func=lambda w: w.metagenotype.linkage('position'),
row_colors_func=lambda w: xr.Dataset(dict(
rev=rev_genotype_error[1],
)),
)
sf.plot.plot_genotype(
sim,
row_linkage_func=lambda w: w.genotype.linkage('strain'),
col_linkage_func=lambda w: w.metagenotype.linkage('position'),
row_colors_func=lambda w: xr.Dataset(dict(
fwd=fwd_genotype_error[1],
)),
)
<seaborn.matrix.ClusterGrid at 0x145d46f90>
import scipy as sp
import matplotlib.pyplot as plt
mg_dist = sim.metagenotype.pdist()
uf_dist = fit.unifrac_pdist()
plt.scatter(sp.spatial.distance.squareform(mg_dist), sp.spatial.distance.squareform(uf_dist))
sp.stats.spearmanr(sp.spatial.distance.squareform(mg_dist), sp.spatial.distance.squareform(uf_dist))
SignificanceResult(statistic=0.5460012636245404, pvalue=4.081769548834536e-96)
This simulate/fit/evaluate loop gives us an opportunity to select better parameters than the defaults. Likewise, in real data we can harness our expectations about strain diversity (across samples), strain heterogeneity (within samples), and genotype ambiguity (a.k.a. fuzzyness) to pick better parameters for a real-world dataset.
The regularization provided by priors is tunable and can be used to get more reasonable estimates.
Generally speaking, more of one type of regularization
(smaller values of gamma_hyper
, pi_hyper
, or rho_hyper
) will result in
less effective regularization of the other two.