16S Amplicon Analysis
Background
For 16S, the V3+V4 hypervariable region is the sequencing target.
ITS sequencing is used mainly for fungal diversity studies and gives highly accurate annotation;
18S sequencing targets eukaryotic microorganisms in general: it covers a broader taxonomic range, but its resolution for fungi is comparatively low.
Workflow
1. From raw data to an OTU table
# Merge paired-end reads
# Remove barcodes and primers
# Remove chimeric sequences
# Cluster sequences into OTUs
# Taxonomic annotation
# OTU filtering
# Data normalization
An OTU (operational taxonomic unit) is a uniform label assigned to some taxonomic unit (strain, species, genus, group, etc.). To count the species, genera, and other taxa present in a sample's sequencing results, the sequences must first be clustered: they are partitioned into groups by pairwise similarity, and each group is one OTU. OTUs are conventionally clustered at 97% similarity, with the most abundant sequence in each cluster taken as its representative. Clustering at 100% identity (i.e., not clustering at all and working with ASVs) is arguably more reasonable, since ASVs are easier to compare across studies.
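To make the clustering step concrete, below is a minimal, hypothetical Python sketch of greedy centroid clustering at 97% identity (the general strategy behind uclust/vsearch); the identity function is a naive positional comparison for illustration only, not the real pairwise alignment those tools use.
## greedy_otu_sketch.py -- illustrative only, not a stand-in for vsearch/uclust
def identity(a, b):
    # naive identity: fraction of matching positions over the shorter length
    n = min(len(a), len(b))
    return sum(x == y for x, y in zip(a, b)) / n if n else 0.0

def greedy_cluster(seqs, threshold=0.97):
    # seqs: sequences sorted by decreasing abundance; each sequence joins the
    # first centroid it matches at >= threshold, otherwise it founds a new OTU
    centroids, otus = [], []
    for s in seqs:
        for i, c in enumerate(centroids):
            if identity(s, c) >= threshold:
                otus[i].append(s)
                break
        else:
            centroids.append(s)
            otus.append([s])
    return centroids, otus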
#############################################################################
2. Species diversity analysis
# Taxon abundance
# Alpha diversity
# Beta diversity
#############################################################################
3. Differential abundance analysis of OTUs and taxa
#############################################################################
4. PICRUSt functional prediction for bacteria
#############################################################################
Article: https://www.nature.com/articles/s41559-019-1063-3
1. Merge paired-end data
####docker run -it -m 8G --cpus 8 --rm -v E:/qime16:/work Kjd/amplseq
dbdir=/work/database
workdir=/work/my_16s
datadir=$workdir/data
scriptdir=$workdir/scripts
fastmap=$workdir/fastmap.txt
mkdir /work/tmp
export TMPDIR=/work/tmp # avoid running out of space on the default temp partition
export PATH=$workdir/scripts:$PATH # add the pipeline scripts to PATH
######################database
# Paths to the major reference databases:
silva_16S_97_seq=$dbdir/SILVA_132_QIIME_release/rep_set/rep_set_16S_only/97/silva_132_97_16S.fna
silva_16S_97_tax=$dbdir/SILVA_132_QIIME_release/taxonomy/16S_only/97/taxonomy_7_levels.txt
greengene_16S_97_seq=$dbdir/gg_13_8_otus/rep_set/97_otus.fasta
greengene_16S_97_tax=$dbdir/gg_13_8_otus/taxonomy/97_otu_taxonomy.txt
silva_18S_97_seq=$dbdir/SILVA_132_QIIME_release/rep_set/rep_set_18S_only/97/silva_132_97_18S.fna
silva_18S_97_tax=$dbdir/SILVA_132_QIIME_release/taxonomy/18S_only/97/taxonomy_7_levels.txt
unite_ITS_97_seq=$dbdir/unite_ITS8.2/sh_refs_qiime_ver8_97_04.02.2020.fasta
unite_ITS_97_tax=$dbdir/unite_ITS8.2/sh_taxonomy_qiime_ver8_97_04.02.2020.txt
# Select the reference database to use
SEQ=$silva_16S_97_seq
TAX=$silva_16S_97_tax
cd $workdir # return to the working directory
mkdir 1.merge_pe
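# FLASH merges overlapping read pairs; flag meanings (from FLASH's help, for reference):
# -m 10   minimum required overlap length
# -x 0.2  maximum allowed mismatch ratio within the overlap
# -p 33   phred offset of the input quality encoding
# -t 1    threads; -o output prefix; -d output directory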
for i in `cat $fastmap |grep -v '#'|cut -f 1` ;do
flash $datadir/${i}_1.fastq.gz $datadir/${i}_2.fastq.gz \
-m 10 -x 0.2 -p 33 -t 1 \
-o $i -d 1.merge_pe
done
2. FastQC quality control of the raw data
cd $workdir # return to the working directory
mkdir 2.fastqc
# FastQC: inspect the per-base quality distributions etc.
fastqc -t 2 $workdir/1.merge_pe/*extendedFrags.fastq -o $workdir/2.fastqc
# Aggregate the QC reports
cd $workdir/2.fastqc
multiqc .
3. Data cleaning: trim adapters and discard low-quality reads
cd $workdir # return to the working directory
mkdir 3.data_qc
cd 3.data_qc
# Remove adapters with fastp
#--qualified_quality_phred the quality value that a base is qualified.
# Default 15 means phred quality >=Q15 is qualified. (int [=15])
#--unqualified_percent_limit how many percents of bases are allowed to be unqualified
#--n_base_limit if one read's number of N base is >n_base_limit,
# then this read/pair is discarded
#--detect_adapter_for_pe: if the adapter sequence is unknown, let fastp auto-detect common adapters
#
for i in `cat $fastmap |grep -v '#'|cut -f 1`; do
fastp --thread 1 --qualified_quality_phred 10 \
--unqualified_percent_limit 50 \
--n_base_limit 10 \
--length_required 300 \
--trim_front1 29 \
--trim_tail1 18 \
-i $workdir/1.merge_pe/${i}.extendedFrags.fastq \
-o ${i}.clean_tags.fq.gz \
--detect_adapter_for_pe -h ${i}.html -j ${i}.json
done
# Summarize the QC statistics:
qc_stat.py -d $workdir/3.data_qc/ -o $workdir/3.data_qc/ -p all_sample_qc
## qc_stat.py
import argparse, os, json, glob
parser = argparse.ArgumentParser(description='Summarize fastp QC results into a single table')
parser.add_argument("-d", '--qcdir', dest='qcdir', required=True, help='set fastp qc dir')
parser.add_argument("-o", '--outdir', dest='outdir', required=False, default=os.getcwd(),
help='specify the output file dir,default is current dir')
parser.add_argument("-p", '--prefix', dest='prefix', required=False, default='all_sample_qc',
help='file name prefix,default all_sample_qc')
args = parser.parse_args()
if not os.path.exists(args.outdir): os.mkdir(args.outdir)
name = os.path.abspath(args.outdir) + '/' + args.prefix
args.qcdir = os.path.abspath(args.qcdir)
files = glob.glob(args.qcdir + "/*json")
assert len(files) > 0, "failed to find fastp json files in dir: %s" % args.qcdir
outfile = open(name + ".tsv", "w")
outfile.write("SampleID\trawDataReads\tcleanDataReads\trawDataBase\tcleanDataBase\tQ20\tQ30\tGC\n")
for i in sorted(files):
(filepath, tempfilename) = os.path.split(i)
(samplename, extension) = os.path.splitext(tempfilename)
f = open(i, "r")
qcjs = json.load(f)
    rawDataReads = format(qcjs["summary"]["before_filtering"]["total_reads"], ",")  # a read pair counts as two reads
rawDataBase = format(qcjs["summary"]["before_filtering"]["total_bases"], ",")
cleanDataReads = format(qcjs["summary"]["after_filtering"]["total_reads"], ",")
cleanDataBase = format(qcjs["summary"]["after_filtering"]["total_bases"], ",")
Q20 = format(qcjs["summary"]["after_filtering"]["q20_rate"] * 100, '0.2f')
Q30 = format(qcjs["summary"]["after_filtering"]["q30_rate"] * 100, '0.2f')
GC = format(qcjs["summary"]["after_filtering"]["gc_content"] * 100, '0.2f')
outfile.write(
"\t".join([samplename, rawDataReads, cleanDataReads, rawDataBase, cleanDataBase, Q20, Q30, GC]) + "\n")
f.close()
outfile.close()
4. Chimera removal
cd $workdir # return to the working directory
mkdir 4.remove_chimeras
cd 4.remove_chimeras
# Remove chimeras
for i in `cat $fastmap |grep -v '#'|cut -f 1`; do
# Dereplicate identical sequences
vsearch --derep_fulllength $workdir/3.data_qc/${i}.clean_tags.fq.gz \
--sizeout --output ${i}.derep.fa
# De novo chimera removal
vsearch --uchime3_denovo ${i}.derep.fa \
--sizein --sizeout \
--nonchimeras ${i}.denovo.nonchimeras.rep.fa
# Re-replicate: expand dereplicated sequences back to their original copy numbers
vsearch --rereplicate ${i}.denovo.nonchimeras.rep.fa --output ${i}.denovo.nonchimeras.fa
done
# Reference-based chimera removal (skip for fungi, which lack a suitable reference database)
for i in `cat $fastmap |grep -v '#'|cut -f 1`; do
vsearch --uchime_ref ${i}.denovo.nonchimeras.fa \
--db $dbdir/rdp_gold.fa \
--sizein --sizeout --fasta_width 0 \
--nonchimeras ${i}.ref.nonchimeras.fa
done
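For intuition, dereplication collapses identical reads into one record that carries a ;size= abundance annotation, which the chimera detector then uses to weight sequences. A minimal, hypothetical Python sketch of that step:
## derep_sketch.py -- illustrative stand-in for `vsearch --derep_fulllength`
from collections import Counter

def dereplicate(seqs, min_size=1):
    # collapse identical sequences; return (label, seq) pairs with
    # vsearch-style ;size= annotations, most abundant first
    counts = Counter(seqs)
    return [("Uni%d;size=%d" % (i, n), seq)
            for i, (seq, n) in enumerate(counts.most_common(), start=1)
            if n >= min_size]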
5. QIIME 1 analysis: OTU picking (clustering) methods
cd $workdir # return to the working directory
mkdir 5.pick_otu_qiime
cd 5.pick_otu_qiime
# Rename sequences per sample (SampleName_1, SampleName_2, ...) in each fasta for QIIME
for i in `cat $fastmap |grep -v '#'|cut -f 1`; do
format_fa_for_qiime.pl -f $workdir/4.remove_chimeras/$i.ref.nonchimeras.fa \
-n $i -out $i.fa
done
# Concatenate the per-sample fa files into qiime.fasta, then delete the per-sample files
cat *fa >qiime.fasta
rm -f *fa
### Method 1
### QIIME pick_otus parameters: http://qiime.org/scripts/pick_otus.html
echo pick_otus:denovo_otu_id_prefix OTU > otu_params_de_novo.txt
echo pick_otus:similarity 0.97 >> otu_params_de_novo.txt
echo pick_otus:otu_picking_method usearch61 >> otu_params_de_novo.txt #sortmerna, mothur, trie, uclust_ref, usearch, usearch_ref, blast, usearch61, usearch61_ref,sumaclust, swarm, prefix_suffix, cdhit, uclust.
echo assign_taxonomy:reference_seqs_fp $SEQ >> otu_params_de_novo.txt
echo assign_taxonomy:id_to_taxonomy_fp $TAX >> otu_params_de_novo.txt
echo assign_taxonomy:similarity 0.8 >>otu_params_de_novo.txt
echo assign_taxonomy:assignment_method uclust >>otu_params_de_novo.txt # rdp, blast,rtax, mothur, uclust, sortmerna
pick_de_novo_otus.py -i qiime.fasta -f -o pick_de_novo_otus \
-p otu_params_de_novo.txt
### Method 2
echo pick_otus:denovo_otu_id_prefix OTU > otu_params_open_reference.txt
echo pick_otus:similarity 0.97 >> otu_params_open_reference.txt
echo assign_taxonomy:reference_seqs_fp $SEQ >> otu_params_open_reference.txt
echo assign_taxonomy:id_to_taxonomy_fp $TAX >> otu_params_open_reference.txt
echo assign_taxonomy:similarity 0.8 >>otu_params_open_reference.txt
echo assign_taxonomy:assignment_method uclust >>otu_params_open_reference.txt # rdp, blast, rtax, mothur, uclust, sortmerna. For ITS/18S data, switch the database to UNITE and set assignment_method to blast; see http://qiime.org/scripts/assign_taxonomy.html
pick_open_reference_otus.py --otu_picking_method usearch61 -i qiime.fasta -r $SEQ \
-f -o pick_open_reference_otus -p otu_params_open_reference.txt \
--min_otu_size 2 -s 0.1
# For ITS/18S analyses, add --suppress_align_and_tree
### Method 3
echo pick_otus:similarity 0.97 > otu_params_closed_reference.txt
echo pick_otus:otu_picking_method usearch61_ref >> otu_params_closed_reference.txt #uclust_ref,usearch_ref, usearch61_ref
echo assign_taxonomy:similarity 0.8 >>otu_params_closed_reference.txt
echo assign_taxonomy:assignment_method uclust >>otu_params_closed_reference.txt # rdp, blast, rtax, mothur, uclust, sortmerna. For ITS/18S data, switch the database to UNITE and set assignment_method to blast; see http://qiime.org/scripts/assign_taxonomy.html
pick_closed_reference_otus.py -i qiime.fasta -f -r $SEQ -t $TAX \
-o pick_closed_reference_otus -p otu_params_closed_reference.txt -s
### Downstream analysis uses the de novo results
# Filter rare OTUs (below 0.005% relative abundance)
biom convert -i pick_de_novo_otus/otu_table.biom \
-o pick_de_novo_otus/otu_table.txt \
--to-tsv --header-key taxonomy
filter_otus_from_otu_table.py -i pick_de_novo_otus/otu_table.biom \
-o pick_de_novo_otus/otu_table_filtered.biom --min_count_fraction 0.00005
# Filter out unwanted taxa
filter_taxa_from_otu_table.py -o pick_de_novo_otus/otu_table_filtered_species.biom \
-i pick_de_novo_otus/otu_table_filtered.biom -n "f__mitochondria,c__Chloroplast,f__Unknown family,Unassigned,k__Eukaryota,k__Archaea,o__Chloroplast,k__Plantae"
# Sort the OTU table
sort_otu_table.py -i pick_de_novo_otus/otu_table_filtered_species.biom \
-o pick_de_novo_otus/otu_table_clean.biom
### Normalization methods:
# Rarefaction: subsample every sample to an equal depth before comparison; here, to the lowest per-sample count
biom summarize-table -i pick_de_novo_otus/otu_table_clean.biom
single_rarefaction.py -i pick_de_novo_otus/otu_table_clean.biom \
-o pick_de_novo_otus/otu_table_clean_rare.biom -d 1722 ### -d is the lowest count among the samples; in practice you may instead drop low-count samples
biom convert -i pick_de_novo_otus/otu_table_clean_rare.biom \
-o pick_de_novo_otus/otu_table_clean_rare.txt \
--to-tsv --header-key taxonomy
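Rarefaction draws reads without replacement down to a common depth; a minimal, hypothetical sketch of what single_rarefaction.py does to one sample's count vector:
## rarefy_sketch.py -- illustrative stand-in for single_rarefaction.py
import numpy as np

def rarefy(counts, depth, seed=0):
    # subsample a vector of per-OTU counts to `depth` reads without replacement
    counts = np.asarray(counts)
    if counts.sum() < depth:
        raise ValueError("sample has fewer reads than the rarefaction depth")
    pool = np.repeat(np.arange(len(counts)), counts)  # one entry per read
    picked = np.random.default_rng(seed).choice(pool, size=depth, replace=False)
    return np.bincount(picked, minlength=len(counts))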
### Other normalization methods (optional): https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4010126/
normalize_table.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom \
-a CSS -o pick_de_novo_otus/otu_table_clean_css.biom
biom convert -i pick_de_novo_otus/otu_table_clean_css.biom \
-o pick_de_novo_otus/otu_table_clean_css.txt \
--to-tsv --header-key taxonomy
normalize_table.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom \
-a DESeq2 -z -o pick_de_novo_otus/otu_table_clean_deseq2_norm_no_negatives.biom
biom convert -i pick_de_novo_otus/otu_table_clean_deseq2_norm_no_negatives.biom \
-o pick_de_novo_otus/otu_table_clean_deseq2_norm_no_negatives.txt \
--to-tsv --header-key taxonomy
# Generate abundance tables at each taxonomic level
summarize_taxa.py -i pick_de_novo_otus/otu_table_clean_rare.biom \
-o taxa_summary/relative_abundance --level 2,3,4,5,6,7
summarize_taxa.py -i pick_de_novo_otus/otu_table_clean_rare.biom \
-o taxa_summary/absolute_abundance -a --level 2,3,4,5,6,7
# Collapse samples by group, then generate per-level abundance tables
mkdir group_otus
for i in city loc country;do
collapse_samples.py -b pick_de_novo_otus/otu_table_clean_rare.biom -m $fastmap \
--collapse_fields $i --output_mapping_fp group_otus/group.$i.txt \
--output_biom_fp group_otus/otu_table_$i.biom
biom convert -i group_otus/otu_table_$i.biom -o group_otus/otu_table_${i}.txt \
--to-tsv --header-key taxonomy
summarize_taxa.py -i group_otus/otu_table_$i.biom \
-o group_otus/taxa_summary_relative.${i} \
-L 2,3,4,5,6,7
summarize_taxa.py -i group_otus/otu_table_$i.biom \
-o group_otus/taxa_summary_absolute.${i} \
-a -L 2,3,4,5,6,7
done
###### Some useful extra commands
# Convert biom files to TSV for easier inspection and downstream analysis
#biom convert -i pick_de_novo_otus/otu_table_clean.biom \
# -o pick_de_novo_otus/otu_table_clean.txt --to-tsv --header-key taxonomy
#biom convert -i pick_de_novo_otus/otu_table_clean_rare.biom \
# -o pick_de_novo_otus/otu_table_clean_rare.txt --to-tsv --header-key taxonomy
## Add metadata
#biom add-metadata -i pick_de_novo_otus/otu_table_clean_rare.biom \
# -o pick_de_novo_otus/otu_table_clean_rare_metadata.biom -m $fastmap
# To filter out outlier samples, use:
#filter_samples_from_otu_table.py
# To split the data by condition and analyze subsets, use:
#split_otu_table.py
### format_fa_for_qiime.pl
use strict;
use Cwd qw(abs_path getcwd);
use Getopt::Long;
use Bio::SeqIO;
use Bio::Seq;
my ($od,$fasta,$name,$out);
GetOptions(
"help|?" =>\&USAGE,
'f=s'=>\$fasta,
'n=s'=>\$name,
"od=s"=>\$od,
"out=s"=>\$out
) or &USAGE;
&USAGE unless ($fasta and $name);
sub USAGE {
my $usage=<<"USAGE";
Program: $0
Contact: huangls
Description:
Usage:
Options:
-f <file> Input fa file, required
-n <str> sample name ,required
-out <str> output file name optional
-od <dir> output dir,optional
-h Help
Example:
perl $0 -f BC2.fasta -n BC2 -out qiime.fa
USAGE
print $usage;
exit 1;
}
$od||=getcwd;
mkdir $od unless(-d $od);
$out||="$name.fa";
my$o = Bio::SeqIO->new(-file => ">$od/$out" ,
-format => 'Fasta');
my$in="";
my $FI;
if ($fasta=~/gz$/){
open $FI, "<:gzip", "$fasta" or die "$!";
$in= Bio::SeqIO->new(-file => $FI ,
-alphabet=>"dna",
-format => 'Fasta');
}else{
$in= Bio::SeqIO->new(-file => "$fasta" ,
-alphabet=>"dna",
-format => 'Fasta');
}
my$i=0;
while ( my $seq = $in->next_seq() ) {
$i++;
my $id = $seq->id;
my $new_seq=Bio::Seq->new(-id=>"${name}_$i",-seq=>$seq->seq);
$o->write_seq($new_seq);
}
$in->close();
$o->close();
5. usearch denoising to build an ASV feature table (fills the same role as the QIIME OTU table above)
cd $workdir # return to the working directory
mkdir 5.usearch_asv
cd 5.usearch_asv
# Convert to usearch-style sequence headers
sed '/^>/s#_\([0-9]*\)$#\.\1#' $workdir/5.pick_otu_qiime/qiime.fasta >all_usearch.fa
#qiimefa2usearch.pl $workdir/5.pick_otu_qiime/qiime.fasta all_usearch.fa
# Dereplication: collapse identical reads, appending ;size= abundance annotations
vsearch --derep_fulllength all_usearch.fa --output uniques.fa \
--relabel Uni --minuniquesize 10 --sizeout
# unoise3 denoising is recommended for single-base-resolution ASVs; the free 32-bit usearch build errors out on large files (the 64-bit build is paid)
#ASV去噪 Denoise: predict biological sequences and filter chimeras
usearch -unoise3 uniques.fa -zotus zotus.fa # may fail on large inputs; raise --minuniquesize in the previous step to shrink the data and retry
# Rename Zotu to ASV_ for readability; otus.fa holds the ASV representative sequences
sed 's/Zotu/ASV_/g' zotus.fa > otus.fa
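Roughly, UNOISE3 merges each low-abundance unique sequence into a nearby high-abundance centroid when its abundance skew falls below a distance-dependent threshold. A minimal sketch of that rule, assuming the beta(d) = 1/2^(alpha*d+1) form with alpha = 2 from the UNOISE2 paper (the distance d would come from a real alignment, which is not computed here):
## unoise_rule_sketch.py -- illustrative abundance-skew rule only, not usearch
def beta(d, alpha=2.0):
    # maximum allowed skew (member abundance / centroid abundance) at edit distance d
    return 1.0 / (2 ** (alpha * d + 1))

def is_noise_of(seq_abund, centroid_abund, d):
    # True if a unique sequence at distance d from a more abundant centroid
    # should be merged into it as a presumed sequencing-error variant
    return d > 0 and seq_abund / centroid_abund <= beta(d)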
# Build the feature table
### Method 1: usearch maps faster, but the free 32-bit build fails on large files; use the vsearch alternative in Method 2 below
#usearch -otutab all_usearch.fa -otus otus.fa -otutabout otu_table.txt -threads 4
### Method 2
vsearch --usearch_global all_usearch.fa --db otus.fa \
--otutabout otu_table.txt --id 0.97 --threads 4
# Convert to biom format so taxonomy metadata can be attached later
biom convert -i otu_table.txt -o otu_table.biom \
--table-type="OTU table" --to-hdf5
# Taxonomic annotation
# Append a dummy 'xx' to each header to keep assign_taxonomy.py from erroring
sed -i 's#\(ASV_[0-9]*\)#\1 xx#' otus.fa
assign_taxonomy.py -o uclust_assigned_taxonomy -i otus.fa -m uclust \
--reference_seqs_fp $SEQ --id_to_taxonomy_fp $TAX --similarity 0.8
# Alternatively use RDP, which needs a lot of memory
#assign_taxonomy.py -o rdp_assigned_taxonomy -i otus.fa -m rdp \
# --reference_seqs_fp $SEQ --id_to_taxonomy_fp $TAX \
# --rdp_max_memory 20000 --confidence 0.5
# Merge the abundance table with the taxonomy assignments into one biom file
# Add a header line
sed -i '1i#OTUID\ttaxonomy\tConfidence\tnum' uclust_assigned_taxonomy/otus_tax_assignments.txt
# Attach the taxonomy metadata
biom add-metadata -i otu_table.biom --observation-metadata-fp uclust_assigned_taxonomy/otus_tax_assignments.txt \
--sc-separated taxonomy -o otu_table_tax.biom
biom convert -i otu_table_tax.biom -o otu_table_tax.txt --to-tsv --header-key taxonomy
# Build a phylogenetic tree (QIIME method)
# Align sequences command
align_seqs.py -i otus.fa -o pynast_aligned_seqs
# Filter alignment command
filter_alignment.py -o pynast_aligned_seqs -i pynast_aligned_seqs/otus_aligned.fasta
# Build phylogenetic tree command
make_phylogeny.py -i pynast_aligned_seqs/otus_aligned_pfiltered.fasta -o rep_set.tre
# With the OTU table (otu_table_tax.biom), representative sequences (otus.fa), and tree (rep_set.tre), the downstream QIIME analyses can proceed.
# Filter rare OTUs (below 0.005% relative abundance)
filter_otus_from_otu_table.py -i otu_table_tax.biom -o otu_table_filtered.biom --min_count_fraction 0.00005
# Filter out unwanted taxa
filter_taxa_from_otu_table.py -o otu_table_filtered_species.biom -i otu_table_filtered.biom -n "f__mitochondria,c__Chloroplast,f__Unknown family,Unassigned,k__Eukaryota,k__Archaea,o__Chloroplast,k__Plantae"
# Sort the OTU table
sort_otu_table.py -i otu_table_filtered_species.biom -o otu_table_clean.biom
### Normalization methods:
# Rarefaction: subsample every sample to an equal depth before comparison; here, to the lowest per-sample count
biom summarize-table -i otu_table_clean.biom
single_rarefaction.py -i otu_table_clean.biom -o otu_table_clean_rare.biom -d 2032
summarize_taxa.py -i otu_table_clean_rare.biom -o taxa_summary/absolute_abundance -a --level 2,3,4,5,6,7
summarize_taxa.py -i otu_table_clean_rare.biom -o taxa_summary/relative_abundance --level 2,3,4,5,6,7
biom convert -i otu_table_clean.biom -o otu_table_clean.txt --to-tsv --header-key taxonomy
biom convert -i otu_table_clean_rare.biom -o otu_table_clean_rare.txt --to-tsv --header-key taxonomy
6. Microbial taxon abundance and composition
cd $workdir
mkdir 6.OTUanalysis
cd 6.OTUanalysis
### Species accumulation curve
species_accumulation_curve.r -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt -n species_accumulation_curve
#Phylum, Class, Order, Family, Genus, Species
# Relative-abundance bar plots across all samples
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L2.txt -t Phylum --name Phylum_tax_bar
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L3.txt -t Class --name Class_tax_bar
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L4.txt -t Order --name Order_tax_bar
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L5.txt -t Family --name Family_tax_bar
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L6.txt -t Genus --name Genus_tax_bar
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L7.txt -t Species --name Species_tax_bar
# Relative-abundance bar plots for grouped samples
for i in city loc country;do
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L2.txt -t Phylum -W 5 --name Phylum_tax_bar.${i}
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L3.txt -t Class -W 5 --name Class_tax_bar.${i}
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L4.txt -t Order -W 5 --name Order_tax_bar.${i}
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L5.txt -t Family -W 5 --name Family_tax_bar.${i}
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L6.txt -t Genus -W 5 --name Genus_tax_bar.${i}
tax_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.${i}/otu_table_${i}_L7.txt -t Species -W 5 --name Species_tax_bar.${i}
done
#OTU number bar plot
out_num_bar_plot.r --outdir taxa_bar_plot -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt --name otu_number_bar_plot
# Recommended: restrict the statistics to high-abundance taxa
filter_otus_from_otu_table.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom -o taxa_bar_plot/otu_table.biom --min_count_fraction 0.001
biom convert -i taxa_bar_plot/otu_table.biom -o taxa_bar_plot/otu_table.txt --to-tsv --header-key taxonomy
out_num_bar_plot.r --outdir taxa_bar_plot -i taxa_bar_plot/otu_table.txt --name otu_number_bar_plot0.001
#OTU venn and flower plot
mkdir otu_venn_plot
# Recommended: compare only high-abundance OTUs in the Venn diagrams
for i in city loc country;do
filter_otus_from_otu_table.py -i $workdir/5.pick_otu_qiime/group_otus/otu_table_${i}.biom \
-o otu_venn_plot/otu_table_${i}.biom --min_count_fraction 0.0001
biom convert -i otu_venn_plot/otu_table_${i}.biom \
-o otu_venn_plot/otu_table_${i}.txt --to-tsv --header-key taxonomy
done
# Plot after filtering
for i in city loc country;do
### 5 or fewer groups
venn_otu_plot.r --outdir otu_venn_plot -i otu_venn_plot/otu_table_${i}.txt \
-n ${i}.otu.venn
### more than 5 groups
flower_plot.py --outdir otu_venn_plot --stat otu_venn_plot/${i}.otu.venn.flower.txt \
--prefix ${i}.otu.venn.flower
### for many groups, an UpSet plot
upsetR_venn_plot.r -i otu_venn_plot/${i}.otu.venn.data.txt -o otu_venn_plot/ \
-n ${i}.otu.vennUpset
done
#heatmap plot Phylum, Class, Order, Family, Genus, Species
# Display the 35 most abundant taxa
abundance_heatmap.r --outdir abundance_heatmap_plot -T 35 -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L3.txt -g city loc country -f $fastmap -n Class_abundance_heatmap
abundance_heatmap.r --outdir abundance_heatmap_plot -T 35 -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L4.txt -g city loc country -f $fastmap -n Order_abundance_heatmap
abundance_heatmap.r --outdir abundance_heatmap_plot -T 35 -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L5.txt -g city loc country -f $fastmap -n Family_abundance_heatmap
abundance_heatmap.r --outdir abundance_heatmap_plot -T 35 -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L6.txt -g city loc country -f $fastmap -n Genus_abundance_heatmap
#abundance_heatmap.r --outdir abundance_heatmap_plot -T 35 -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L7.txt -g city loc country -f $fastmap -n Species_abundance_heatmap
# Ternary plot analysis
ternary_plot.r -i $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.city/otu_table_city_L3.txt \
-s PU GA SD --top 20 --name Ternary.city --outdir ternary_plot
### Taxonomic tree plots
# Single-sample plot: circle size reflects the taxon's share at that level. Of the two numbers
# under each circle, the first is the count of sequences assigned only to that taxon (not to any
# lower rank); the second is the total count of sequences assigned to that taxon.
# Multi-sample plot: the pie at each node shows each sample's relative share of that taxon;
# the two numbers under each circle have the same meaning as above.
level_tree_sample.py -f $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt -t 20 --outdir level_tree_plot
level_tree_group.py --file $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt --fastmap $fastmap --group city --top 20 --prefix level_tree_city --outdir level_tree_plot
level_tree_group.py --file $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt --fastmap $fastmap --group loc --top 20 --prefix level_tree_loc --outdir level_tree_plot
level_tree_group.py --file $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt --fastmap $fastmap --group country --top 20 --prefix level_tree_country --outdir level_tree_plot
#level_tree_sample.py -f $workdir/5.pick_otu_qiime/normalized_otu_css/CSS_normalized_otu_table.txt -t 100 --outdir level_tree_plot
# Krona interactive taxonomy pie charts
sh $scriptdir/OTUtable2KRONA.sh $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.txt Krona_pie/all_sample
sh $scriptdir/OTUtable2KRONA.sh $workdir/5.pick_otu_qiime/group_otus/otu_table_loc.txt Krona_pie/loc
sh $scriptdir/OTUtable2KRONA.sh $workdir/5.pick_otu_qiime/group_otus/otu_table_city.txt Krona_pie/city
sh $scriptdir/OTUtable2KRONA.sh $workdir/5.pick_otu_qiime/group_otus/otu_table_country.txt Krona_pie/country
# Circos circular plot
circle_plot.pl -in $workdir/5.pick_otu_qiime/group_otus/taxa_summary_relative.city/otu_table_city_L2.txt -value 0.02 -od circos_plot
#### species_accumulation_curve.r
library(getopt)
spec <- matrix(c(
'help', 'h', 0, "logical", "help",
'input', 'i', 1, "character", "otu table, absolute abundance, required.",
'outdir', 'o', 1, "character", "output directory for pic file, [optional, default: cwd]",
# 'xlab' , 'a', 1, "character","xlab [optional, default: Sample]",
# 'ylab' , 'b', 1, "character","ylab [optional, default: OTU Number]",
'name', 'n', 1, "character", "out file name prefix,[optional, default: species_accumulation_curve]",
'height', 'H', 1, "integer", "the height of graph ,unit is inches [optional, default: 7]",
'width', 'W', 1, "integer", "the width of graph ,unit is inches [optional, default: 7]"
), byrow = TRUE, ncol = 5)
opt <- getopt(spec)
print_usage <- function(spec = NULL) {
cat("")
cat(getopt(spec, usage = TRUE));
q(status = 1);
}
if (!is.null(opt$help) | is.null(opt$input)) {
print_usage(spec)
}
library(vegan)
if (!is.null(opt$help)) { print_usage(spec) }
if (is.null(opt$input)) { print_usage(spec) }
if (is.null(opt$outdir)) { opt$outdir = getwd() }
#if ( is.null(opt$xlab) ) { opt$xlab="Sample" }
#if ( is.null(opt$ylab) ) { opt$ylab="OTU Number" }
if (is.null(opt$name)) { opt$name = "species_accumulation_curve" }
if (is.null(opt$height)) { opt$height = 7 }
if (is.null(opt$width)) { opt$width = 7 }
#set some reasonable defaults for the options that are needed,
#but were not specified.
if (!file.exists(opt$outdir)) {
if (!dir.create(opt$outdir, showWarnings = FALSE, recursive = TRUE)) {
stop(paste("dir.create failed: outdir=", opt$outdir, sep = ""))
}
}
otu <- read.table(opt$input, row.names = 1, sep = "\t", header = F, check.names = FALSE)
max.otu <- nrow(otu)
otu <- otu[, -ncol(otu)]
otu <- t(otu)
min.otu <- min(rowSums(otu > 0))
all <- specaccum(otu, method = "random")
png(filename = paste(opt$outdir, "/", opt$name, ".png", sep = ""), height = opt$height * 300, width = opt$width * 300, res = 300, units = "px")
plot(all, ci.type = "poly", col = "blue", lwd = 2, ci.lty = 0,
ci.col = "lightblue", main = "Species Accumulation Curve", xlab = "Number of samples",
ylim = c(min.otu * 0.9, max.otu * 1.1),
ylab = "Observed species")
boxplot(all, col = "yellow", add = TRUE, pch = 20)
dev.off()
pdf(file = paste(opt$outdir, "/", opt$name, ".pdf", sep = ""), height = opt$height, width = opt$width)
plot(all, ci.type = "poly", col = "blue", lwd = 2, ci.lty = 0,
ci.col = "lightblue", main = "Species Accumulation Curve", xlab = "Number of samples",
ylim = c(min.otu * 0.9, max.otu * 1.1),
ylab = "Observed species")
boxplot(all, col = "yellow", add = TRUE, pch = 20)
dev.off()
### tax_bar_plot.r
require(ggplot2)
library(reshape2)
require(RColorBrewer)
library("stringr")
mycol = c("grey", brewer.pal(7, "Set2"), rev(brewer.pal(8, "Set1")), brewer.pal(8, "Set3"), brewer.pal(12, "Paired"), brewer.pal(8, "Dark2"), brewer.pal(7, "Accent"))
# Read the data; skip=1 skips the first header line of the summarize_taxa output
# (getopt argument parsing is omitted in this excerpt; opt$... follows the same pattern as species_accumulation_curve.r above)
df <- read.table(opt$input, sep = "\t", skip = 1, header = TRUE, check.names = FALSE, comment.char = "@")
# Parse the taxon name in the first column; entries without a name are lumped into Unknown
spe.names <- str_split(df$`#OTU ID`, ";", simplify = T)
spe.name <- str_replace(spe.names[, ncol(spe.names)], "^\\w__", "")
spe.name[spe.name == "" | spe.name == "Other"] <- "Unknown"
df$`#OTU ID` <- spe.name
#df$`#OTU ID`
mydf <- df[df$`#OTU ID` != "Unknown",]
# Sum the Unknown rows into one
if (nrow(mydf) == nrow(df)) { }else {
unknown <- data.frame(c("Unknown", colSums(data.frame(df[df$`#OTU ID` == "Unknown", 2:ncol(df)], check.names = F))), check.names = F)
rownames(unknown)[1] = colnames(df)[1]
df = rbind(mydf, t(unknown))
}
# Sort rows by total abundance
if (ncol(df) > 2) {
odf = df[order(rowSums(sapply(df[, 2:ncol(df)], as.double)), decreasing = T),]
}else {
odf = df[order(df[, 2], decreasing = T),]
}
# Collapse everything beyond the 15 most abundant taxa into Other
if (nrow(df) > 15) {
xdf = odf[1:15,]
other = data.frame(c("Other", colSums(data.frame(sapply(odf[16:nrow(odf), 2:ncol(odf)], as.double), check.names = F))), check.names = F)
rownames(other)[1] = colnames(df)[1]
xdf = rbind(xdf, t(other))
}else {
xdf = odf
}
myorder <- xdf[, 1]
myorder <- setdiff(myorder, c("Other"))
# Prepare the plotting data
mdf <- melt(xdf, id.vars = colnames(xdf)[1])
colnames(mdf) <- c("OTU", "sample", "value")
# Fix the stacking order of taxa in the bars
mdf$OTU <- factor(mdf$OTU, levels = c("Other", rev(myorder)), ordered = T)
# Draw the plot
p <- ggplot(mdf, aes(sample, as.double(value))) +
geom_bar(aes(fill = OTU), stat = "identity") +
scale_fill_manual(values = mycol, name = opt$title) +
labs(y = opt$ylab, x = opt$xlab, fill = opt$title) +
coord_cartesian(ylim = c(0, 1), expand = FALSE)
p <- p +
theme_classic() +
theme(
# panel.grid=element_blank(),
axis.text.x = element_text(colour = "black", angle = 45, hjust = 1), axis.text.y = element_text(colour = "black")) +
theme(legend.key = element_blank())
pdf(file = paste(opt$outdir, "/", opt$name, ".pdf", sep = ""), height = opt$height, width = opt$width)
print(p)
dev.off()
png(filename = paste(opt$outdir, "/", opt$name, ".png", sep = ""), height = opt$height * 300, width = opt$width * 300, res = 300, units = "px")
print(p)
dev.off()
## out_num_bar_plot.r
require(ggplot2)
require(reshape2)
require(RColorBrewer)
library("stringr")
mycol <- brewer.pal(9, "Set1")
df <- read.table(opt$input, sep = "\t", skip = 1, header = TRUE, check.names = FALSE, comment.char = "@")
#df<-read.table("../5.qiime_analysis/group_otus/otu_table_loc.txt",sep="\t",skip=1,header = TRUE,check.names=FALSE,comment.char="@")
df <- df[, c(-1, -ncol(df))]
# An OTU counts as present in a sample when its absolute abundance reaches the threshold; count OTUs per sample
otu.num <- colSums(df >= opt$threshold)
otu.num <- c(otu.num[order(otu.num, decreasing = T)], All = nrow(df))
mydf <- data.frame(sample = names(otu.num), number = otu.num)
mydf$sample <- factor(mydf$sample, levels = mydf$sample, ordered = T)
p <- ggplot(mydf, aes(x = sample, y = number)) +
geom_bar(fill = mycol[1], stat = "identity", position = "dodge", width = 0.8) +
geom_text(aes(label = number), position = "stack", vjust = -0.5, size = 3) +
labs(y = opt$ylab, x = opt$xlab) +
scale_y_continuous(expand = c(0, 0)) +
coord_cartesian(ylim = c(0, max(mydf$number) * 1.1))
p <- p +
theme_classic() +
theme(
# panel.grid=element_blank(),
axis.text.x = element_text(colour = "black", angle = 45, hjust = 1), axis.text.y = element_text(colour = "black")) +
theme(legend.key = element_blank(), legend.title = element_blank())
pdf(file = paste(opt$outdir, "/", opt$name, ".pdf", sep = ""), height = opt$height, width = opt$width)
print(p)
dev.off()
png(filename = paste(opt$outdir, "/", opt$name, ".png", sep = ""), height = opt$height * 300, width = opt$width * 300, res = 300, units = "px")
print(p)
dev.off()
### Venn plot helper functions (shared by venn_otu_plot.r below)
library(RColorBrewer)
library(VennDiagram)
library(futile.logger)
flog.threshold(ERROR) # silence VennDiagram's logging
mycol = brewer.pal(8, "Set1")
get_data_list <- function(str, names, out_file) {
uniq_all_id = c()
data_list = list()
for (i in 1:length(str)) {
mydata <- str[[i]]
uniq_all_id <- union(as.character(mydata), uniq_all_id)
data_list[[i]] <- mydata
}
aa = matrix(data = "NA", nrow = length(uniq_all_id), ncol = length(data_list) + 1, byrow = FALSE, dimnames = NULL)
#aa<-as.data.frame(aa)
aa[, 1] = as.character(uniq_all_id)
print(head(aa))
names(data_list) <- as.character(names)
for (i in 1:length(data_list)) {
for (j in 1:nrow(aa)) {
if (aa[j, 1] %in% as.character(data_list[[i]])) {
aa[j, i + 1] = aa[j, 1]
}else {
}
}
}
colnames(aa) = c("all_ID", names)
write.table(aa, file = out_file, quote = F, sep = '\t', row.names = F, col.names = T)
return(data_list)
}
venn5 <- function(data_list, file_name, main = "", h = 10, w = 10) {
library(VennDiagram)
pdf(paste(file_name, ".pdf", sep = ""), height = h, width = w)
venn.plot <- venn.diagram(
x = data_list,
filename = NULL,
#col = c("red","yellow","green","purple","salmon4"),
fill = c("deeppink", "lightpink", "peachpuff2", "khaki1", "darkseagreen2"),
alpha = 0.50,
cex = c(1.5, 1.5, 1.5, 1.5, 1.5, 1, 0.8, 1, 0.8, 1, 0.8, 1, 0.8,
1, 0.8, 1, 0.55, 1, 0.55, 1, 0.55, 1, 0.55, 1, 0.55, 1, 1, 1, 1, 1, 1.5),
cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3"),
cat.cex = 1.5,
cat.fontface = "bold",
margin = 0.05
);
grid.draw(venn.plot);
dev.off()
png(paste(file_name, ".png", sep = ""), height = h * 300, width = w * 300, res = 300)
venn.plot <- venn.diagram(
x = data_list,
filename = NULL,
#col = c("red","yellow","green","purple","salmon4"),
fill = c("deeppink", "lightpink", "peachpuff2", "khaki1", "darkseagreen2"),
alpha = 0.50,
cex = c(1.5, 1.5, 1.5, 1.5, 1.5, 1, 0.8, 1, 0.8, 1, 0.8, 1, 0.8,
1, 0.8, 1, 0.55, 1, 0.55, 1, 0.55, 1, 0.55, 1, 0.55, 1, 1, 1, 1, 1, 1.5),
cat.col = c("dodgerblue", "goldenrod1", "darkorange1", "seagreen3", "orchid3"),
cat.cex = 1.5,
cat.fontface = "bold",
margin = 0.05
);
grid.draw(venn.plot);
dev.off()
}
venn4 <- function(data_list, file_name, main = "", h = 10, w = 10) {
library(VennDiagram)
pdf(paste(file_name, ".pdf", sep = ""), height = h, width = w)
venn.plot <- venn.diagram(
x = data_list,
filename = NULL,
#col = c("yellow","green","purple","red"),
#lty = "dotted",
lwd = 2,
fill = c("darkseagreen2", "lightpink", "peachpuff2", "khaki1"),
#fill = c("darkorchid1", "darkorchid1", "darkorchid1", "darkorchid1"),
alpha = 0.80,
label.col = c("orange", "black", "darkorchid4", "black", "black", "black",
"black", "black", "darkblue", "black",
"black", "black", "black", "darkgreen", "black"),
cex = 1.5,
fontfamily = "serif",
fontface = "bold",
cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"),
cat.cex = 2,
cat.fontfamily = "serif"
);
grid.draw(venn.plot);
dev.off()
png(paste(file_name, ".png", sep = ""), height = h * 300, width = w * 300, res = 300)
venn.plot <- venn.diagram(
x = data_list,
filename = NULL,
#col = c("yellow","green","purple","red"),
#lty = "dotted",
lwd = 2,
fill = c("darkseagreen2", "lightpink", "peachpuff2", "khaki1"),
alpha = 0.80,
label.col = c("orange", "black", "darkorchid4", "black", "black", "black",
"black", "black", "darkblue", "black",
"black", "black", "black", "darkgreen", "black"),
cex = 1.5,
fontfamily = "serif",
fontface = "bold",
cat.col = c("darkblue", "darkgreen", "orange", "darkorchid4"),
cat.cex = 2,
cat.fontfamily = "serif"
);
grid.draw(venn.plot);
dev.off()
}
venn3 <- function(data_list, file_name, main = "", h = 10, w = 10) {
library(VennDiagram)
pdf(paste(file_name, ".pdf", sep = ""), height = h, width = w)
venn.plot <- venn.diagram(
x = data_list,
col = mycol[1:3],
filename = NULL,
output = TRUE,
#height = 3000,
#width = 3000,
resolution = 300,
compression = 'lzw',
units = 'px',
lwd = 3,
#lty = 'blank',
fill = NULL,
alpha = 0.80,
cex = 1.5,
#fontface = "bold",
fontfamily = "sans",
cat.cex = 2,
#cat.fontface = "bold",
cat.default.pos = "outer",
cat.col = mycol[1:3],
cat.pos = c(-27, 27, 135),
cat.dist = c(0.055, 0.055, 0.085),
cat.fontfamily = "sans",
rotation = 1
);
grid.draw(venn.plot);
dev.off()
png(paste(file_name, ".png", sep = ""), height = h * 300, width = w * 300, res = 300)
venn.plot <- venn.diagram(
x = data_list,
col = mycol[1:3],
filename = NULL,
output = TRUE,
#height = 3000,
#width = 3000,
resolution = 300,
compression = 'lzw',
units = 'px',
lwd = 3,
#lty = 'blank',
fill = NULL,
alpha = 0.80,
cex = 1.5,
#fontface = "bold",
fontfamily = "sans",
cat.cex = 2,
#cat.fontface = "bold",
cat.default.pos = "outer",
cat.col = mycol[1:3],
cat.pos = c(-27, 27, 135),
cat.dist = c(0.055, 0.055, 0.085),
cat.fontfamily = "sans",
rotation = 1
);
grid.draw(venn.plot);
dev.off()
}
venn2 <- function(data_list, file_name, main = "", h = 10, w = 10) {
library(VennDiagram)
pdf(paste(file_name, ".pdf", sep = ""), height = h, width = w)
venn.plot <- venn.diagram(
x = data_list,
filename = NULL,
col = mycol[1:2],
lwd = 3,
fill = NULL,
alpha = 0.75,
label.col = "black",
cex = 1.5,
fontfamily = "serif",
fontface = "bold",
cat.col = mycol[1:2],
cat.cex = 2,
cat.fontfamily = "serif",
cat.fontface = "bold",
cat.dist = c(0.03, 0.03),
cat.pos = c(-20, 14)
);
grid.draw(venn.plot);
dev.off()
png(paste(file_name, ".png", sep = ""), height = h * 300, width = w * 300, res = 300)
venn.plot <- venn.diagram(
x = data_list,
col = mycol[1:2],
filename = NULL,
lwd = 3,
fill = NULL,
alpha = 0.75,
label.col = "black",
cex = 1.5,
fontfamily = "serif",
fontface = "bold",
cat.col = mycol[1:2],
cat.cex = 2,
cat.fontfamily = "serif",
cat.fontface = "bold",
cat.dist = c(0.03, 0.03),
cat.pos = c(-20, 14)
);
grid.draw(venn.plot);
dev.off()
}
venn1 <- function(data_list, file_name, main = "", h = 10, w = 10) {
library(VennDiagram)
pdf(paste(file_name, ".pdf", sep = ""), height = h, width = w)
venn.plot <- venn.diagram(
x = data_list,
col = mycol[1],
filename = NULL,
lwd = 2,
fontface = "bold",
fill = NULL,
cat.col = mycol[1],
alpha = 0.75,
cex = 1.5,
cat.cex = 2,
cat.fontface = "bold"
);
grid.draw(venn.plot);
dev.off()
png(paste(file_name, ".png", sep = ""), height = h * 300, width = w * 300, res = 300)
venn.plot <- venn.diagram(
x = data_list,
col = mycol[1],
filename = NULL,
lwd = 2,
fontface = "bold",
fill = NULL,
cat.col = mycol[1],
alpha = 0.75,
cex = 1.5,
cat.cex = 2,
cat.fontface = "bold"
);
grid.draw(venn.plot);
dev.off()
}
outersect <- function(l, core, x) {
big.vec <- x
duplicates <- big.vec[duplicated(big.vec)]
aa = setdiff(big.vec, unique(duplicates))
num = c()
for (i in names(l)) {
num = c(num, sum(l[[i]] %in% aa))
}
data.frame(group = c(names(l), "core"), num = c(num, core))
}
fnlist <- function(x, fil) {
z <- deparse(substitute(x))
cat(z, "\n", file = fil)
nams = names(x)
for (i in seq_along(x)) { cat(nams[i], "\t", x[[i]], "\n", file = fil, append = TRUE) }
}
################################################
##### venn_otu_plot.r
require(ggplot2)
require(reshape2)
require(RColorBrewer)
library("stringr")
library(plyr)
df <- read.table(opt$input, sep = "\t", row.names = 1, skip = 1, header = TRUE, check.names = FALSE, comment.char = "@")
#df<-read.table("../5.qiime_analysis/group_otus/otu_table_loc.txt",row.names=1,sep="\t",skip=1,header = TRUE,check.names=FALSE,comment.char="@")
df <- df[, c(-ncol(df))]
# An OTU counts as present in a sample when its absolute abundance is at least 1
#df[df<opt$threshold]<-NA
df[df < 1] <- NA
otu.id <- rownames(df)
map.id <- function(x) {
x[!is.na(x)] <- otu.id[!is.na(x)]
x
}
mydf <- sapply(df, map.id)
write.table(mydf, file = paste(opt$outdir, "/", opt$name, ".data.txt", sep = ""), quote = F, sep = '\t', row.names = F, col.names = T)
mydf <- sapply(as.data.frame(mydf), na.omit)
samn <- length(mydf)
coreID = Reduce(intersect, mydf)
flower = outersect(mydf, length(coreID), unlist(mydf))
write.table(flower, file = paste(opt$outdir, "/", opt$name, ".flower.txt", sep = ""), quote = F, sep = '\t', row.names = F, col.names = T)
### samn selects the matching venn1-venn5 function above
if (samn >= 2 && samn <= 5) {
myvenn <- get(paste0("venn", samn))
myvenn(mydf, paste(opt$outdir, "/", opt$name, sep = ""), h = opt$height, w = opt$width)
}else {
cat("more than 5 group, venn not plot\n")
}
# (flower plots for more than 5 groups are drawn by flower_plot.py, not by this script)
7. Differential abundance analysis of OTUs and taxa
cd $workdir
mkdir 7.otu_diff_stat
cd 7.otu_diff_stat
############################metastat######################################
# Run the differential test on normalized abundances (relative-abundance tables or CSS-normalized)
#metastat
for i in {2..6};do
echo meta_stat.r -i $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L$i.txt \
-f $fastmap -t city \
-a PU -b GA \
-o meta_stat --name PU_vs_GA_L$i
done
##################### LEfSe LDA discriminant analysis ########
# Split by the major group first, then compare by subgroup: --split country -cl loc
# The biom file is automatically converted to relative abundance before the comparison
koeken.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom \
-m $fastmap -l 3 -pc --split country -cl loc -str 0 \
--effect 2 --pval 0.05 -dp 300 -o lefse/country_loc
# If no sub-split is wanted, add a column to fastmap in which every sample has the same value (here Description)
#more_strict
koeken.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom \
-m $fastmap -l 3 -pc --split Description -cl loc -str 0 \
--effect 2 --pval 0.05 -dp 300 -o lefse/loc_more_strict
#less_strict
koeken.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom \
-m $fastmap -l 3 -pc --split Description -cl loc -str 1 \
--effect 2 --pval 0.05 -dp 300 -o lefse/loc_less_strict
######################### STAMP ############
# STAMP takes the normalized data
for i in {2..7};do
filter_otus_from_otu_table.py --min_count_fraction 0.001 -i $workdir/5.pick_otu_qiime/taxa_summary/absolute_abundance/otu_table_clean_rare_L$i.biom -o otu_table_clean_rare_L$i.biom
biom_to_stamp.py otu_table_clean_rare_L$i.biom >otu_table_stamp_L$i.spf
done
######################### R package DESeq2 #######
mkdir deseq2
# DESeq2 requires the raw absolute-abundance OTU table; do not normalize it beforehand
differential_abundance.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom -o deseq2/France_vs_Germany_diff_otus.txt -m $fastmap -a DESeq2_nbinom -c country -x France -y Germany --DESeq2_diagnostic_plots
volcano_and_MA_plot.r -i deseq2/France_vs_Germany_diff_otus.txt --name France_vs_Germany_diff_otus -o deseq2/
# Differential analysis at other taxonomic levels
#differential_abundance.py -i $workdir/5.pick_otu_qiime/taxa_summary/absolute_abundance/otu_table_clean_L6.biom -o deseq2/France_vs_Germany_diff_L6.txt -m $fastmap -a DESeq2_nbinom -c country -x France -y Germany --DESeq2_diagnostic_plots
#volcano_and_MA_plot.r -i deseq2/France_vs_Germany_diff_L6.txt --name France_vs_Germany_diff_L6 -o deseq2/
## koeken.py
import os
import os.path
import shlex
import sys
import glob
import argparse
import subprocess
import re
import logging
import qiime
import pandas as pd

__version__ = "0.0.0"  # placeholder: the original koeken.py defines __version__ itself
def get_args():
parser = argparse.ArgumentParser(
description='Performs Linear Discriminant Analysis (LEfSe) on A Longitudinal Dataset.', add_help=True)
parser.add_argument('-v', '--version', action='version', version=__version__)
parser.add_argument('-d', '--debug', action='store_true', dest="debug", help='Enable debugging', default=False)
parser.add_argument('-i', '--input', action="store", dest="input_biom",
help='Location of the OTU Table for main analysis. (Must be .biom format)', required=True,
type=str)
parser.add_argument('-o', '--output', action="store", dest="outputDir",
help='Location of the folder to place all resulting files. If folder does not exist, the program will create it.',
required=True, type=str)
parser.add_argument('-m', '--map', action="store", dest="map_fp",
help='Location of the mapping file associated with OTU Table.', required=True, type=str)
parser.add_argument('-l', '--level', action="store", dest="level", default=6,
help='Level for which to use for summarize_taxa.py. [default = 6]', type=int,
choices=[2, 3, 4, 5, 6, 7])
    parser.add_argument('-cl', '--class', action="store", dest="classid",
                        help='Name of the class (grouping) column in the mapping file.', required=True,
                        type=str)
    parser.add_argument('-sc', '--subclass', action="store", dest="subclassid", default='NA',
                        help='Name of the subclass column in the mapping file. [default = NA]', type=str)
parser.add_argument('-su', '--subject', action="store", dest="subjectid", default='#SampleID',
help='Only change if your Sample-ID column names differs. [default] = #SampleID.', type=str)
"""Arguments for LEfSe discriminant analysis"""
parser.add_argument('-p', '--pval', action="store", dest="p_cutoff", default=0.05,
help='Change alpha value for the Anova test (default 0.05)')
parser.add_argument('-e', '--effect', action="store", dest="lda_cutoff", default=2,
help='Change the cutoff for logarithmic LDA score (default 2.0).', type=float)
parser.add_argument('-str', '--strict', action="store", dest="strictness", default=0, choices=[0, 1],
help='Change the strictness of the comparisons. Can be changed to less strict (1). [default = 0](more-strict).',
type=int)
parser.add_argument('-c', '--compare', action="store", dest="compare", default="", nargs='+',
help='Which groups should be kept to be compared against one another. [default = all factors]',
required=False, type=str)
parser.add_argument('-sp', '--split', action="store", dest="split", default="NA",
help='The name of the timepoint variable in you mapping file. This variable will be used to split the table for each value in this variable.',
required=True, type=str)
parser.add_argument('-pc', '--clade', action="store_true", dest="clade",
help='Plot Lefse Cladogram for each output time point. Outputs are placed in a new folder created in the lefse results location.',
default=False)
parser.add_argument('-s', '--rmtaxa', action="store_true", dest="remove_taxa_name",
help='whether Remove greengenes taxa prefix names to make it prettier', default=False)
parser.add_argument('-it', '--image', action="store", dest="image", type=str,
help='Set the file type for the image create when using cladogram setting', default='pdf',
choices=["png", "pdf", "svg"])
parser.add_argument('-dp', '--dpi', action="store", dest="dpi", type=int, help='Set DPI resolution for cladogram',
default=300)
    parser.add_argument('-pi', '--picrust', action="store_true", dest="picrust",
                        help='Run the analysis on a PICRUSt biom file. Must use categorize-by-function level 3; future updates will support the other levels.',
                        default=False)
return parser.parse_args()
def main(args):
input_biom = args.input_biom
output_dir = args.outputDir
map_fp = args.map_fp
level = args.level
classid = args.classid
subclassid = args.subclassid
subjectid = args.subjectid
compare = args.compare
split = args.split
p_cutoff = args.p_cutoff
lda_cutoff = args.lda_cutoff
strictness = args.strictness
clade = args.clade
image = args.image
dpi = args.dpi
"""Check to see if output directories exist or not and create them."""
if not os.path.exists(output_dir):
os.makedirs(output_dir)
else:
        logging.warning(
            'Output folder already exists; errors may occur. Please delete or rename the output folder before running again.' + '\n')
logging.basicConfig(level=logging.INFO,
format='%(asctime)s %(levelname)-8s %(message)s',
datefmt='%a, %d %b %Y %H:%M:%S',
filename=output_dir + '/log.out',
filemode='w')
"""Output location from summarize_taxa.py step"""
sumtaxa_dir = '{}/{}{}/'.format(output_dir, "summarize_taxa_L", str(level))
if not os.path.exists(sumtaxa_dir):
os.makedirs(sumtaxa_dir)
lefse_dir = '{}/{}'.format(output_dir, "lefse_output")
if not os.path.exists(lefse_dir):
os.makedirs(lefse_dir)
format_dir = '{}/{}/'.format(lefse_dir, "format_lefse")
if not os.path.exists(format_dir):
os.makedirs(format_dir)
run_dir = '{}/{}/'.format(lefse_dir, "run_lefse")
if not os.path.exists(run_dir):
os.makedirs(run_dir)
if clade == True:
# clado_dir = '{}/{}/'.format(run_dir, "cladograms")
clado_dir = run_dir
if not os.path.exists(clado_dir):
os.makedirs(clado_dir)
if args.picrust:
summarize_cmd = "summarize_taxa.py -i %s -o %s -m %s -L 3 -d '|' --md_identifier 'KEGG_Pathways' " % (
input_biom, sumtaxa_dir, map_fp)
else:
summarize_cmd = "summarize_taxa.py -i %s -o %s -m %s -L %d -d '|'" % (input_biom, sumtaxa_dir, map_fp, level)
subprocess.call(shlex.split(summarize_cmd))
"""Get filename of generated summarize_taxa.py outputs"""
if args.picrust:
sumtaxa_loc = glob.glob(sumtaxa_dir + '*_L3' + '.txt')
else:
sumtaxa_loc = glob.glob(sumtaxa_dir + '/*_L' + str(level) + '.txt')
"""Create panda dataframes from summarize_taxa output file and mapping file"""
sumtaxa_df = pd.read_table(sumtaxa_loc[0])
map_df = pd.read_table(map_fp)
subjectID_pos = sumtaxa_df.columns.get_loc(subjectid)
classID_pos = sumtaxa_df.columns.get_loc(classid)
    bacteria_pos = list(range(len(map_df.columns), len(sumtaxa_df.columns)))  # columns after the mapping-file columns hold the taxa
"""Cases for addition of subject class"""
if subclassid == "NA":
to_keep = [subjectID_pos, classID_pos] + bacteria_pos
else:
subclassID_pos = sumtaxa_df.columns.get_loc(subclassid)
to_keep = [subjectID_pos, classID_pos, subclassID_pos] + bacteria_pos
"""Subset the data if particular group comparisons are given"""
if compare != "":
sumtaxa_df = sumtaxa_df[sumtaxa_df[classid].isin(compare)]
""" Remove greengenes taxa names to makeit prettier """
if args.remove_taxa_name:
sumtaxa_df = sumtaxa_df.rename(columns=lambda x: re.sub('.__', '', x))
sumtaxa_df = sumtaxa_df.rename(columns=lambda x: re.sub(' ', '_', x))
"""For each timepoint, remove unwanted columns and perform LEfSe analysis"""
grouped_df = sumtaxa_df.groupby(str(split))
for name, group in grouped_df:
table = group.iloc[:, to_keep].transpose()
table_filtered = table.loc[~(table == 0).all(axis=1)]
table_out = '{}{}{}'.format(sumtaxa_dir, name, '_input.txt')
table_filtered.to_csv(table_out, sep='\t', header=False, index=True)
"""Run format_input.py from LEfSe package"""
format_file_out = format_dir + os.path.basename(table_out).replace('_input.txt', '_format.txt')
if subclassid == "NA":
subprocess.call(" ".join(
["export export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-format_input.py', table_out,
format_file_out, '-u 1', '-c 2', '-o 1000000', '-f', 'r']), shell=True)
logging.info(
" ".join(['format_input.py', table_out, format_file_out, '-u 1', '-c 2', '-o 1000000', '-f', 'r']))
else:
subprocess.call(" ".join(
["export export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-format_input.py', table_out,
format_file_out, '-u 1', '-c 2', '-s 3', '-o 1000000', '-f', 'r']), shell=True)
logging.info(" ".join(
['format_input.py', table_out, format_file_out, '-u 1', '-c 2', '-s 3', '-o 1000000', '-f', 'r']))
run_file_out = run_dir + os.path.basename(format_file_out).replace('_format.txt', '.txt')
subprocess.call(" ".join(
["export export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'run_lefse.py', format_file_out,
run_file_out, '-a', str(p_cutoff), '-l', str(lda_cutoff), '-y', str(strictness)]), shell=True)
"""Check to see if cladogram option was chosen"""
if clade == True:
"""Run plot_cladogram.py from LEfSe package"""
# clade_file_out = clado_dir + os.path.basename(format_file_out).replace('_format.txt', '.' + image)
clade_file_out = clado_dir + os.path.basename(format_file_out).replace('_format.txt', '_cladogram.')
res_file_out = clado_dir + os.path.basename(format_file_out).replace('_format.txt', '_res.')
# subprocess.call(['plot_cladogram.py', run_file_out, clade_file_out, '--format', image, '--dpi', str(dpi), '--title', str(name)])
subprocess.call(" ".join(
["export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-plot_cladogram.py', run_file_out,
clade_file_out + 'png', '--format', 'png', '--dpi', str(dpi), '--title', str(name)]), shell=True)
subprocess.call(" ".join(
["export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-plot_cladogram.py', run_file_out,
clade_file_out + 'pdf', '--format', 'pdf', '--dpi', str(dpi), '--title', str(name)]), shell=True)
subprocess.call(" ".join(
["export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-plot_res.py', run_file_out,
res_file_out + 'png', '--format', 'png', '--dpi', str(dpi), '--title', str(name)]), shell=True)
subprocess.call(" ".join(
["export PATH=/biosoft/miniconda/envs/lefse/bin/:$PATH;", 'lefse-plot_res.py', run_file_out,
res_file_out + 'pdf', '--format', 'pdf', '--dpi', str(dpi), '--title', str(name)]), shell=True)
if __name__ == '__main__':
args = get_args()
map_chk = pd.read_table(args.map_fp)
if str(args.classid) not in map_chk.columns.values.tolist():
raise ValueError(
'Warning. There is no class variable with that column name in your mapping file. Please verify the class ID chosen is actually a column name in your mapping file.')
    if str(args.split) not in map_chk.columns.values.tolist():
        raise ValueError(
            'Warning. There is no split variable with that column name in your mapping file. Please verify the split ID chosen is actually a column name in your mapping file.')
if (args.compare) != "":
if (len(args.compare) == 1):
raise ValueError(
"Warning. Comparison needs to be in the format of Group1 Group2. Do not use any separators.")
main(args)
8. Alpha diversity analysis
cd $workdir # return to the working directory
mkdir 8.alpha_diversity
cd 8.alpha_diversity
# Alpha rarefaction curves for comparing alpha diversity
biom summarize-table -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom
echo alpha_diversity:metrics observed_species,PD_whole_tree,shannon,chao1,simpson,goods_coverage > alpha_params.txt
alpha_rarefaction.py -f -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom \
-m $fastmap -o ./ -p alpha_params.txt \
-t $workdir/5.pick_otu_qiime/pick_de_novo_otus/rep_set.tre \
--min_rare_depth 40 --max_rare_depth 2030 --num_steps 10
# Check whether sequencing depth is sufficient by rarefying up to a large maximum:
echo alpha_diversity:metrics observed_species,PD_whole_tree,shannon,chao1,simpson,goods_coverage > alpha_params.txt
alpha_rarefaction.py -f -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean.biom \
-m $fastmap -o ./alpha_rarefaction_check -p alpha_params.txt \
-t $workdir/5.pick_otu_qiime/pick_de_novo_otus/rep_set.tre \
--min_rare_depth 40 --max_rare_depth 100000 --num_steps 10
# Rank-abundance curves for all samples
plot_rank_abundance_graph.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom -s '*' -x -f pdf -v -o all_rank
# Rank-abundance curve for a single sample
plot_rank_abundance_graph.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom -s 'ERR3975187' -x -f pdf -v -o ERR3975187_rank
# Compute alpha diversity indices
# Available metrics: 'ace','berger_parker_d','brillouin_d','chao1','chao1_ci','dominance','doubles','enspie','equitability','esty_ci','fisher_alpha','gini_index','goods_coverage','heip_e','kempton_taylor_q','margalef','mcintosh_d','mcintosh_e','menhinick','michaelis_menten_fit','observed_otus','observed_species','osd','simpson_reciprocal','robbins','shannon','simpson','simpson_e','singles','strong','PD_whole_tree'
alpha_diversity.py -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom -m observed_species,PD_whole_tree,shannon,chao1,simpson,goods_coverage,ace -o alpha_div_index.txt -t $workdir/5.pick_otu_qiime/pick_de_novo_otus/rep_set.tre
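For reference, the most common indices have simple closed forms. A minimal sketch, assuming a vector of OTU counts for one sample (Shannon is shown in log2 units, chao1 in its bias-corrected form; both conventions are assumptions about what the tool reports):
## alpha_index_sketch.py -- illustrative versions of a few alpha metrics
import numpy as np

def shannon(counts, base=2):
    # Shannon entropy H = -sum(p * log p)
    p = np.asarray(counts, float)
    p = p[p > 0] / p.sum()
    return float(-(p * np.log(p)).sum() / np.log(base))

def simpson(counts):
    # Simpson diversity 1 - sum(p^2)
    p = np.asarray(counts, float)
    p = p / p.sum()
    return float(1 - (p ** 2).sum())

def chao1(counts):
    # bias-corrected Chao1: S_obs + F1*(F1-1) / (2*(F2+1))
    counts = np.asarray(counts)
    f1 = int((counts == 1).sum())  # singletons
    f2 = int((counts == 2).sum())  # doubletons
    return int((counts > 0).sum()) + f1 * (f1 - 1) / (2 * (f2 + 1))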
# Group comparisons with QIIME's built-in tests and plots
compare_alpha_diversity.py \
-i alpha_div_collated/chao1.txt \
-o alpha_chao1_stats \
-m $fastmap \
-t nonparametric \
-c city,loc,country
compare_alpha_diversity.py \
-i alpha_div_collated/chao1.txt \
-o alpha_chao1_stats \
-m $fastmap \
-t parametric \
-c city,loc,country
# R-based tests (t-test, Wilcoxon, ANOVA) and boxplots
for i in goods_coverage observed_species PD_whole_tree shannon simpson;do
for g in loc city country;do
alpha_compare_stat.r -i alpha_div_index.txt --index $i \
--group $g -f $fastmap -o alpha_diversity_stat
alpha_compare_plot.r -i alpha_div_index.txt -I $i -f $fastmap \
-g $g -G boxplot jitter -o alpha_diversity_stat --name "${i}_${g}_boxplot"
done
done
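The same two-group tests can be sketched outside R as well; a minimal, hypothetical example with SciPy, assuming alpha_div_index.txt has samples as rows and indices as columns, and that the mapping file has a "city" group column (both names are assumptions):
## alpha_stat_sketch.py -- illustrative two-group tests on one alpha index
import pandas as pd
from scipy import stats

idx = pd.read_table("alpha_div_index.txt", index_col=0)  # samples x indices
meta = pd.read_table("fastmap.txt", index_col=0)         # mapping file
shannon = idx["shannon"]
groups = meta.loc[shannon.index, "city"]
a, b = [shannon[groups == g] for g in groups.unique()[:2]]  # first two groups
print(stats.ttest_ind(a, b, equal_var=False))               # Welch t-test
print(stats.mannwhitneyu(a, b, alternative="two-sided"))    # Wilcoxon rank-sum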
9. Beta diversity
cd $workdir # return to the working directory
mkdir 9.beta_diversity
cd 9.beta_diversity
echo beta_diversity:metrics binary_jaccard,bray_curtis,unweighted_unifrac,weighted_unifrac,binary_euclidean > beta_params.txt
#--metrics: invalid choice: 'euclidian' (choose from 'abund_jaccard','binary_chisq','binary_chord','binary_euclidean','binary_hamming','binary_jaccard','binary_lennon','binary_ochiai','binary_otu_gain','binary_pearson','binary_sorensen_dice','bray_curtis','bray_curtis_faith','bray_curtis_magurran','canberra','chisq','chord','euclidean','gower','hellinger','kulczynski','manhattan','morisita_horn','pearson','soergel','spearman_approx','specprof','unifrac','unifrac_g','unifrac_g_full_tree','unweighted_unifrac','unweighted_unifrac_full_tree','weighted_normalized_unifrac','weighted_unifrac'
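Of these, Bray-Curtis is the most widely used abundance-based dissimilarity; a minimal sketch of its definition, for two per-sample count vectors:
## bray_curtis_sketch.py -- illustrative definition
import numpy as np

def bray_curtis(x, y):
    # BC(x, y) = 1 - 2 * sum(min(x_i, y_i)) / (sum(x) + sum(y))
    x, y = np.asarray(x, float), np.asarray(y, float)
    return 1.0 - 2.0 * np.minimum(x, y).sum() / (x.sum() + y.sum())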
# Use the CSS-normalized table for the beta diversity analysis:
beta_diversity_through_plots.py -f -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_css.biom -m $fastmap -o ./ -t $workdir/5.pick_otu_qiime/pick_de_novo_otus/rep_set.tre -p beta_params.txt
# Rarefied input discards more information and is not recommended:
#beta_diversity_through_plots.py -f -i $workdir/5.pick_otu_qiime/pick_de_novo_otus/otu_table_clean_rare.biom -m $fastmap -o ./ -t $workdir/5.pick_otu_qiime/pick_de_novo_otus/rep_set.tre -p beta_params.txt
# QIIME's built-in 2D PCoA plots
for i in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
make_2d_plots.py -i ${i}_pc.txt -m $fastmap -o pcoa_2d_plots/
done
# NMDS analysis
for i in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
nmds.py -i ${i}_dm.txt -o ${i}_nmds.txt -d 3
done
# Sample clustering (UPGMA)
for i in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
upgma_cluster.py -i ${i}_dm.txt -o ${i}_upgma_cluster.nwk
done
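UPGMA is average-linkage hierarchical clustering on the distance matrix, so the tree can be cross-checked with SciPy; a minimal sketch, assuming the square *_dm.txt distance matrices produced above:
## upgma_sketch.py -- illustrative UPGMA via SciPy average linkage
import pandas as pd
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.spatial.distance import squareform

dm = pd.read_table("bray_curtis_dm.txt", index_col=0)  # square distance matrix
Z = linkage(squareform(dm.values, checks=False), method="average")  # UPGMA
print(dendrogram(Z, no_plot=True, labels=list(dm.columns))["ivl"])  # leaf order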
# Plotting: PCoA, NMDS, and cluster trees:
for i in loc city country;do
for j in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
pcoa_plot.r -i ${j}_pc.txt -f $fastmap -g $i --name ${j}_${i}_pc --title $j -o pcoa_plot
nmds_plot.r -i ${j}_nmds.txt -f $fastmap -g $i --name ${j}_${i}_nmds --x.lab NMDS1 --y.lab NMDS2 -o nmds_plot
cluster_tree_plot.r --tree ${j}_upgma_cluster.nwk --group $i --fastmap $fastmap -n ${j}_${i}_cluster_tree -o cluster_tree_plot
done
done
#tree and tax bar plot
for i in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
echo cluster_tree_and_bar_plot.r --title Phylum \
--Xlab Relative_abundance --xlab ${i}_Distance \
--tree ${i}_upgma_cluster.nwk --group country \
--fastmap $fastmap -n ${i}_cluster_tree_bar \
-o cluster_tree_bar_plot \
--tax $workdir/5.pick_otu_qiime/taxa_summary/relative_abundance/otu_table_clean_rare_L2.txt
done
# Heatmaps of the distance matrices
for j in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
beta_div_heatmap.r -i ${j}_dm.txt -f $fastmap --group city loc country --name ${j}_heatmap
done
# Beta diversity statistics and boxplots
# These tests report significance (p-value) and effect size (R); R indicates how much the grouping contributes.
# Example:
for f in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
for i in city loc country;do
for j in adonis mrpp anosim;do
compare_categories.py -i ${f}_dm.txt -o beta_div_stats/${f}_${j}_${i}_stat -m $fastmap -c $i --method $j
#Compare within vs between b-diversity.
make_distance_boxplots.py -d ${f}_dm.txt -o beta_div_stats/${f}_${j}_${i}_boxplot -m $fastmap -f $i --save_raw_data
done
done
done
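compare_categories.py wraps tests such as PERMANOVA (adonis) and ANOSIM; the same PERMANOVA can be sketched with scikit-bio (using scikit-bio here is an assumption, as is the "city" column name):
## permanova_sketch.py -- illustrative PERMANOVA with scikit-bio
import pandas as pd
from skbio import DistanceMatrix
from skbio.stats.distance import permanova

dm = pd.read_table("bray_curtis_dm.txt", index_col=0)  # square distance matrix
meta = pd.read_table("fastmap.txt", index_col=0)       # mapping file
result = permanova(DistanceMatrix(dm.values, ids=dm.columns),
                   meta.loc[dm.columns, "city"], permutations=999)
print(result)  # pseudo-F statistic and p-value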
# CPCoA: constrained principal coordinates analysis
# requires three or more levels of the constraining (grouping/environmental) variable
for i in binary_euclidean binary_jaccard bray_curtis unweighted_unifrac weighted_unifrac;do
for j in city loc;do
beta_cpcoa_and_plot.r -i ${i}_dm.txt -f $fastmap -g $j --size 6 --alpha 0.5 -e -L --name ${i}_${j}_cpcoa -o cpcoa_analysis_plot
done
done
10. Functional annotation (PICRUSt)
cd $workdir # return to the working directory
mkdir 10.function_picrust
cd 10.function_picrust
echo pick_otus:similarity 0.97 > otu_params.txt
echo pick_otus:otu_picking_method usearch61_ref >> otu_params.txt #uclust_ref,usearch_ref, usearch61_ref
echo assign_taxonomy:similarity 0.8 >>otu_params.txt
echo assign_taxonomy:assignment_method uclust >>otu_params.txt # rdp, blast, rtax, mothur, uclust, sortmerna. For ITS/18S data, switch the database to UNITE and set assignment_method to blast; see http://qiime.org/scripts/assign_taxonomy.html
# Closed-reference OTU picking against GreenGenes (required by PICRUSt), then filter and normalize
pick_closed_reference_otus.py -r $greengene_16S_97_seq \
-t $greengene_16S_97_tax -i $workdir/5.pick_otu_qiime/qiime.fasta -f \
-o pick_closed_reference_otus -p otu_params.txt -s
filter_otus_from_otu_table.py -i pick_closed_reference_otus/otu_table.biom \
-o pick_closed_reference_otus/otu_table_filtered.biom --min_count_fraction 0.00001 \
--negate_ids_to_exclude -e $greengene_16S_97_seq
sort_otu_table.py -i pick_closed_reference_otus/otu_table_filtered.biom \
-o pick_closed_reference_otus/closed_otu_table.biom
single_rarefaction.py -i pick_closed_reference_otus/closed_otu_table.biom \
-o pick_closed_reference_otus/closed_otu_table_rare.biom -d 1604
biom convert -i pick_closed_reference_otus/closed_otu_table_rare.biom \
-o pick_closed_reference_otus/closed_otu_table_rare.txt --to-tsv --header-key taxonomy
# Correct for 16S copy number
normalize_by_copy_number.py -i pick_closed_reference_otus/closed_otu_table_rare.biom \
-o normalized_otus.biom \
-c $dbdir/picrust_data/16S_13_5_precalculated.tab.gz
biom convert -i normalized_otus.biom -o normalized_otus.txt --to-tsv --header-key taxonomy
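Copy-number correction divides each OTU's counts by the predicted number of 16S copies in that organism's genome, since multi-copy taxa would otherwise be over-counted. A minimal, hypothetical sketch with toy values (the real copy numbers come from the precalculated table above):
## copy_number_sketch.py -- illustrative stand-in for normalize_by_copy_number.py
import pandas as pd

# toy inputs: an OTU x sample count table and per-OTU predicted 16S copy numbers
otu = pd.DataFrame({"S1": [10, 4], "S2": [0, 8]}, index=["OTU1", "OTU2"])
copies = pd.Series({"OTU1": 2.0, "OTU2": 4.0})

normalized = otu.div(copies, axis=0)  # divide each OTU row by its copy number
print(normalized)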
# KEGG functional prediction
predict_metagenomes.py -i normalized_otus.biom -o predictions_kegg.biom -c $dbdir/picrust_data/ko_13_5_precalculated.tab.gz
biom convert -i predictions_kegg.biom --to-tsv --header-key KEGG_Description -o predictions_kegg.txt
# COG functional prediction
predict_metagenomes.py --type_of_prediction cog -i normalized_otus.biom -o predictions_cog.biom -c $dbdir/picrust_data/cog_13_5_precalculated.tab.gz
biom convert -i predictions_cog.biom --to-tsv --header-key COG_Description -o predictions_cog.txt
# Collapse KEGG predictions by pathway level
categorize_by_function.py -i predictions_kegg.biom -c "KEGG_Pathways" -l 1 -o predictions_kegg_at_L1.biom
categorize_by_function.py -i predictions_kegg.biom -c "KEGG_Pathways" -l 2 -o predictions_kegg_at_L2.biom
categorize_by_function.py -i predictions_kegg.biom -c "KEGG_Pathways" -l 3 -o predictions_kegg_at_L3.biom
biom convert -i predictions_kegg_at_L1.biom --to-tsv --header-key KEGG_Pathways -o predictions_kegg_at_L1.txt
biom convert -i predictions_kegg_at_L2.biom --to-tsv --header-key KEGG_Pathways -o predictions_kegg_at_L2.txt
biom convert -i predictions_kegg_at_L3.biom --to-tsv --header-key KEGG_Pathways -o predictions_kegg_at_L3.txt
# Collapse COG predictions by category level
categorize_by_function.py -i predictions_cog.biom -c "COG_Category" -l 1 -o predictions_cog_at_L1.biom
categorize_by_function.py -i predictions_cog.biom -c "COG_Category" -l 2 -o predictions_cog_at_L2.biom
biom convert -i predictions_cog_at_L1.biom --to-tsv --header-key COG_Category -o predictions_cog_at_L1.txt
biom convert -i predictions_cog_at_L2.biom --to-tsv --header-key COG_Category -o predictions_cog_at_L2.txt
# Which bacteria contribute each function of interest; the full output is very large, so run these yourself if needed
#metagenome_contributions.py -i normalized_otus.biom -o ko_metagenome_contributions.xls -c $dbdir/picrust_data/ko_13_5_precalculated.tab.gz
#metagenome_contributions.py -i normalized_otus.biom -t cog -o cog_metagenome_contributions.xls -c $dbdir/picrust_data/cog_13_5_precalculated.tab.gz
# Export only the functions of interest
metagenome_contributions.py -i normalized_otus.biom \
-o ko_metagenome_contributions1.xls -c $dbdir/picrust_data/ko_13_5_precalculated.tab.gz \
-l K00001,K00002,K00004
metagenome_contributions.py -i normalized_otus.biom -t cog \
-o cog_metagenome_contributions1.xls -c $dbdir/picrust_data/cog_13_5_precalculated.tab.gz \
-l COG0001,COG0002
# Summarize and visualize with QIIME
echo summarize_taxa:md_identifier "KEGG_Pathways" >sum_taxa_picrust.txt
echo summarize_taxa:absolute_abundance True >> sum_taxa_picrust.txt # comment this line out to get relative abundance instead
echo summarize_taxa:level 1,2,3 >>sum_taxa_picrust.txt
summarize_taxa_through_plots.py -f -i predictions_kegg.biom -p sum_taxa_picrust.txt -o summarize_kegg_through_plots
# Alpha diversity here counts the KOs annotated in each sample
alpha_diversity.py -i predictions_kegg.biom -m observed_species,shannon,chao1,simpson,goods_coverage,ace -o alpha_div_index.txt
# Beta diversity reflects differences in functional composition between samples; without a tree, UniFrac cannot be computed
echo beta_diversity:metrics binary_jaccard,bray_curtis,binary_euclidean > beta_params.txt
beta_diversity_through_plots.py -f -i predictions_kegg.biom -m $fastmap -o beta_diversity_through_plots -p beta_params.txt
# Convert to STAMP format for differential analysis in STAMP
biom_to_stamp.py -m KEGG_Pathways predictions_kegg_at_L2.biom > predictions_kegg_at_L2.spf
biom_to_stamp.py -m KEGG_Pathways predictions_kegg_at_L3.biom > predictions_kegg_at_L3.spf
biom_to_stamp.py -m KEGG_Description predictions_kegg.biom > predictions_kegg.spf
biom_to_stamp.py predictions_cog_at_L1.biom > predictions_cog_at_L1.spf
biom_to_stamp.py predictions_cog_at_L2.biom > predictions_cog_at_L2.spf
biom_to_stamp.py predictions_cog.biom > predictions_cog.spf
# LEfSe differential analysis
koeken.py -i predictions_kegg_at_L3.biom -m $fastmap -l 3 -pc --split Description -cl loc -str 1 \
--effect 3 --pval 0.05 -dp 300 -o lefse/loc_less_strict -pi