05植物线粒体流程3-高级分析
一.数据准备
1.1 ref下载
## 复制表格中的物种 至 tmp_ad/tree/cp文件(以+号为分割)
# python3 /share/nas1/yuj/script/chloroplast/download_genome_from_ncbi.py -f gb -a list_tre # 备用下载
编号目录]
touch tmp_{ad,tree,cp}
awk -F "+" '{ print $2 }' tmp_ad | awk '$1=$1' > list_ad && cat list_ad && down- ad &
awk -F "+" '{ print $2 }' tmp_tree | awk '$1=$1' > list_tree && cat list_tree && down- tree &
awk -F "+" '{ print $2 }' tmp_cp | awk '$1=$1' > list_cp && cat list_cp && down- cp &
mv ref_* analysis/
1.2 测序样品及ref处理(多序列物种)
需要合并多序列物种的gb文件,变成单序列,方便后续分析
1.对样品进行处理,合并gene_anno中单个gb(1.gb、2.gb、3.gb等)文件到上一级目录analysis/annotation/Polygala_tenuifolia_Willd中(组装物种多序列时使用)
perl /share/nas6/xul/program/mt2/annotation/pipline/src/merge_multi_chr_mt_gbk.pl -i gene_anno/*.gbk > `basename *.fasta .fasta`.gbk
!!!不能有相同的物种名 !!! ???
--------- 先看2.0章节检查物种
2.对高级分析和进化树物种处理
!!!只能一个一个物种处理!!!
## 合并高级分析中多环线粒体的gb文件(高级分析物种多序列时使用)
perl /share/nas6/xul/program/mt2/annotation/pipline/src/merge_multi_chr_mt_gbk.pl -i *.gbk > merge.gbk
## fasta2中的多序列要cat到一个文件中,然后重命名
比如说参考A有两条序列都在fasta中
cat fasta2/* > merge.fasta
每一个合并后的fa文件里面都是多条序列,然后用下面的命令对多序列重命名
for j in *.fasta ;do perl -i -lane 'if(/^>/){if($. == 1){print "$_","_",++$i}else{print ">",++$i}}else{print}' $j ; done
1.3 改名
## 给gbk改名
GP-XXX]$ cd ref_adv/gbk && rename .gbk .1.gbk * && cd - # 加上.1,一般不需要
GP-XXX]$ cd ref_adv/fasta && for i in *.fasta;do rename `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'` `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print $2"""_"""$1""".1" }'` ../gbk/""`awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`.gbk ;done && cd ../../
## 给fasta改名
GP-XXX]$ cd ref_adv/fasta && rename .fasta .1.fasta * && cd - # 加上.1,一般不需要
GP-XXX]$ cd ref_adv/fasta && for i in *.fasta;do rename `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'` `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print $2"""_"""$1""".1" }'` ../fasta/""`awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`.fasta ;done && cd ../../
二.共有CDS进化树
2.0 检查进化树物种
prefix=`basename assembly/*/*.fasta .fasta` # 设置下前缀名
cd ref_tree/gbk
grep DEFI * # 看下是不是都是完整的
grep -c " CDS" * # 确保都有注释
grep DEFI * | grep chrom # 看下有没有把一个线粒体的多序列分好几个gbk,有的话需要全部下下来,用上面的ref合并处理
2.1 ML
cd analysis
mkdir phytree && cd phytree && mkdir gbk/ && cp ../annotation/*/*.gbk ../ref_tree/gbk/*.gbk gbk # 这里用的单序列
1.提取基因比对
perl /share/nas6/xul/program/mt2/phytree/gene_tree/find_total_gene.pl -i gbk/ -o gene -p $prefix && \
## 可以不管,一般不考虑
for i in gene/mafft/*.fasta ; do echo $i; perl /share/nas6/xul/program/chloroplast/pi/compute_pi_use_aln_fasta.pl -i $i | grep Nucleotide ;done && \ # 可以先运行这一步,看一下pi值,把异常的基因从其物种gbk中删除
## 可以不管,一般不考虑
## 查看下数量,例子如下
ls gbk/ |wc
31 31 501 # 一共31个物种
## 看下每个基因是否都在31个物种里
grep -c ">" gene/mafft/*
gene/mafft/gene10.atp1.fasta:31
gene/mafft/gene11.atp8.fasta:31
gene/mafft/gene12.cox3.fasta:31
gene/mafft/gene13.sdh4.fasta:15
## 看下31个物种的共有基因数量
grep -c ">" gene/mafft/* | grep ":31" | wc -l
去掉非共有基因
2.
perl /share/nas6/xul/program/mt2/phytree/connection_head_tail.pl -i gene/mafft/*.fasta -o outseq && \
/share/nas6/xul/soft/trimal/trimal-1.4.1/source/trimal -in outseq/gene_seq.aln -out outseq/gene_seq.aln.trim.aln -gt 0.7 && \
nohup perl /share/nas6/xul/program/mt2/phytree/raxml_phytree.pl -i outseq/gene_seq.aln.trim.aln -aln -o ./phytree && \
nohup java -jar /share/nas6/zhouxy/biosoft/jmodeltest/jmodeltest-2.1.10/jModelTest.jar -i -f -g 4 -BIC -AIC -AICc -DT -tr 15 -o jmodeltest -d outseq/gene_seq.aln.trim.aln & # 模型预测
perl /share/nas6/xul/program/mt2/phytree/get_id_genbank.pl ../ref_tree/gbk/*.gbk > id.list && perl /share/nas6/xul/program/mt2/phytree/trans_name.pl phytree/sample.cds.nwk > cds.nwk # 进化树改名
2.2 MB
mkdir mb && cd mb
cp_aln2nexus.pl ../outseq/gene_seq.aln.trim.aln > gene.nex
/share/nas6/xul/soft/mrbayes/MrBayes-3.2.7/src/mb
Execute gene.nex
lset nst=6 rates=invgamma
mcmc ngen=1000000 samplefreq=100 printfreq=100 diagnfreq=5000
no
sump burnin=2500(ngen/samplefreq)*0.25
sumt burnin=2500
perl /share/nas6/xul/program/mt2/phytree/trans_name.pl mb/gene.nex.con.tre > mb.tre
2.3 MP
FastTreeMP -nt -gtr < gene_seq.aln.trim.aln > samples.phytree.ML.nwk
perl /share/nas6/xul/program/mt2/phytree/get_id_genbank.pl gbk/*.gbk > id.list
perl /share/nas6/xul/program/mt2/phytree/trans_name.pl phytree/sample.cds.nwk > cds.nwk
2.4 IQ-tree
#perl /share/nas6/xul/program/mt2/phytree/gene_tree/find_total_gene_intron.pl -i gbk/ -p PP231953 -o gene
iqtree2 -s outseq/gene_seq.aln.trim.aln -m MFP -B 1000 -T 40
三.高级分析
3.1 pi
cd analysis
prefix=`basename assembly/*/*.fasta .fasta`
nohup perl /share/nas6/xul/program/mt2/pi/pi.no.ir.analysis.pl -o pi -i annotation/*/*.gbk ref_ad/gbk/*.gbk -p $prefix &
同样查看下异常值,检查下对应基因序列
## 仅绘图
Bin="/share/nas6/xul/program/mt2/pi"
perl $Bin/src/plot.pi.no.ir.pl -i pi.stat.xls -o pi.gene.svg
## 按客户要求,修改宽度
python3 /share/nas1/yuj/script/chloroplast/advance/pi/plmt_plot.pi.no.ir.py -i pi.stat.xls -o pi.gene.png
# 里面规定了字体,需要在windows下绘图
3.2 cp2mt
nohup perl /share/nas6/xul/program/mt2/annotation_for_multi_seq/pipline/cp2mt_circos/bin/cp_mt2_circos.v2.pl -o cp_mt_links -mt annotation/$prefix/gene_anno/$prefix.gbk -cp ref_cp/gbk/*.gbk &
## -orf annotation/Capsicum_frutescens/orf_annotation.info
3.3 kaks
nohup perl /share/nas6/xul/program/mt2/kaks/kaks.each.two.pl -i annotation/*/*.gbk ref_ad/gbk/*.gbk -r annotation/*/*.gbk ref_ad/gbk/*.gbk -o kaks && perl /share/nas6/xul/program/chloroplast/kaks/merge_kaks.pl -i kaks/*/*.xls -o kaks/ && Rscript /share/nas6/xul/program/chloroplast/kaks/boxplot_kaks.R kaks/gene.kaks.xls kaks/gene.kaks.pdf && convert -density 300 kaks/gene.kaks.pdf kaks/gene.kaks.png &
## 改名,用于条形图,以下命令
perl /share/nas6/xul/program/mt2/kaks/get_id_genbank.pl ../ref_ad/gbk/*.gbk > id.list
for i in $prefix*kaks2.xls ;do perl /share/nas6/xul/program/mt2/kaks/trans_name.pl $i > `basename $i .kaks2.xls`_for_draw.xls ;done
sed -i 's/NA/0/g' *_for_draw.xls
for i in *_for_draw.xls; do Rscript /share/nas6/xul/program/chloroplast/kaks/plot_kaks.R $i `basename $i .xls`.pdf && convert -density 300 `basename $i .xls`.pdf `basename $i .xls`.png;done
rm *2.xls
## 箱线图
Rscript /share/nas6/xul/program/chloroplast/kaks/boxplot_kaks.R gene.kaks.xls gene.kaks.pdf && convert -density 300 gene.kaks.pdf gene.kaks.png
mkdir mid && cp */*xls mid
3.4 collinearity
mkdir collinearity && cd collinearity && mkdir fasta && cp ../ref_ad/fasta2/*.fasta fasta && cp ../assembly/*/*.fasta ./
!!!!!!!!!!!修改fasta内名称!!!!!!!!!!!! 多条序列的需要将第一条序列改成 物种_染色体名
示例:
>1
>2
改成
>Tussilago_farfara_L._1
>2
1.mummer类型显示
for i in fasta/*.fasta ;do for j in *.fasta ;do nucmer $j $i --maxmatch && show-coords out.delta > $i.`basename $j .fasta`.coords ;done ;done && rm out.delta &
n=2 && perl /share/nas6/xul/program/mt2/coline/src/plot_multi_mummer.pl -i fasta/*.coords -o mummerplot.svg -n $n && svg2xxx -dpi 300 -t png mummerplot.svg && svg2xxx -t pdf mummerplot.svg # n表示一行显示的图片框个数
2.同线性显示 synteny 不做了
perl /share/nas6/xul/program/mt2/coline/src/get_collinearity_input.pl -i *.fasta fasta/*.fasta && realpath tmp_collinearity/input.cfg & # input.cfg第一列是样品名,第四列是染色体名
# perl /share/nas6/xul/program/mt2/coline/src/draw_collinearity.pl -i tmp_collinearity/input.cfg -l tmp_collinearity/*.txt -o synteny.svg && svg2xxx -t png -dpi 300 synteny.svg # 3种颜色
perl /share/nas1/yuj/program/mt2/coline/src/draw_collinearity.pl -i tmp_collinearity/input.cfg -l tmp_collinearity/*.txt -o synteny.svg && svg2xxx -t png -dpi 300 synteny.svg # 15种颜色
【(2)使用blastn(2.10.1+)软件,设置-word_size为7,E-value为1e-5,并筛选比对长度大于300 bp的片段,对组装的物种和所选的物种依次进行比对,使用perl编写绘图程序绘制共线性图,如下所示:
注:每行的方框表示一个基因组,中间的连线表示同源性区域。】
这个图是仿照MCScanX软件 https://github.com/wyp1125/MCScanX 来的
四.其他分析
trna 二级结构
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_rnaflod_from_trnascan_ss.pl -i *.ss
perl /share/nas6/xul/program/mt/tRNA/draw_tRNA.pl -i trn/*.svg
剪切方式、rna编辑位点(prep)绘图
1.获取各个样品的rna编辑位点
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl
cat *.txt >all.txt
2.生成配置文件,检查配置文件,注意trans
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_edit_site_trans_cis_info.pl -i1 ../*.info -i2 *.txt > cfg
3.绘图
cfg:/share/nas6/xul/project/xianliti2/GP-20200909-2269_sichuanmongdadongwukejixueyuan_1smaple_laomangmai_xianliti2/analysis/edit_cis_trans_plot/cfg
perl /share/nas6/xul/program/mt2/annotation/pipline/src/draw_edit_site_trans_cis.pl -i cfg -p Elymus_sibiricus
绘制带有进化树的一张图
制作nwk文件,用mega调整好,放到cfg第一行,以#开头
perl /share/nas6/xul/program/mt2/annotation/pipline/src/draw_edit_site_trans_cis_for_one_fig.pl -i cfg -p Elymus_sibiricus
与核基因组之间的共线性
多序列:
perl /share/nas6/xul/program/mt2/annotation_for_multi_seq/pipline/cp2mt_circos/bin/mt_nucler_circos.pl -mt ../annotation/Prunus_davidiana/gene_anno/Prunus_davidiana.gbk -nuc GCA_020226225.1_ST_assembly_genomic.fna -o tmp
#单条序列:
#合并mt chl序列,把id分别改成mt cp!!
cat mt.fa chl.fa > mt_and_chl.fa
#获取共线性信息,修改染色体名称,改成数字!!!,然后运行:
blastn -query mt_and_chl.fa -subject chr.fa -outfmt 6 -evalue 1e-5 > blast.xls
分段表示,500,500-1000,>1000
perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_mt1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_mt2.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_mt3.txt
#perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_mt1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_mt2.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_mt3.txt
perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_cp1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_cp2.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_cp3.txt
#复制配置文件到当前目录
cp /share/nas6/xul/program/mt2/cp2mt_circos/ticks.conf /share/nas6/xul/program/mt2/cp2mt_circos/ideogram.conf .
#获取sample.karyotype.txt 和配置文件circos.conf 以及
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_karyotype_for_genome.pl -m mt_and_chl.fa -g chr.fa
#运行circos程序
/share/nas6/xul/soft/circos/circos-0.69-6/bin/circos -conf circos.conf && svg2xxx -t pdf circos.svg
cp /share/nas6/xul/program/mt2/cp2mt_circos/readme_mt_genome.link.txt .
# 单序列
perl /share/nas6/xul/program/mt2/cp2mt_circos/gene_in_repeat_for_blastn.pl -i1 ../annotation/Brassica_rapa_var._purpuraria/gene_anno/Brassica_rapa_var._purpuraria.gbk -i2 blast.xls
#统计同源序列长度分布和每条染色体上同源序列占比
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_len_distri.pl -i blast_Mcor.xls -g Mcor.chr.fasta -o ./
cat chr_distri.xls | grep -v chr | cut -f 1 | perl -lane 'print "\"$_\""' | paste -s -d ","
vi /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_percent.R
Rscript /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_percent.R chr_distri.xls chr_distri.pdf
Rscript /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_number.R len_distri.xls len_distri.pdf
物种间circos图
最外层:圆圈外侧和内侧所示的基因块分别顺时针和逆时针转录。Genes from the samecomplex are similarly colored.
0.获取线粒体注释文件,需要增加gene标签
1.获取gene highligths文件,按照基因复合物来定义颜色,识别的是gene标签,单个文件获取,然后合并
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_gene_highlights2.pl -chr chr1 -i info1
cat *_gene_name.txt > gene_name.txt
2.获取同源序列,注意序列
blastn -query chr1.fa -subject chr2.fa -outfmt 6 -evalue 1e-5 > blast.xls
perl -lane 'if($F[2]>=70){print "chr1 $F[6] $F[7] chr2 $F[8] $F[9]"}' blast.xls > link.txt
3.获取序列gc含量
perl /share/nas6/xul/program/mt2/cp2mt_circos/compute_gc_with_window.pl
cat *_gc.txt > gc.txt
4.获取覆盖深度
cat *_depth.txt > depth.txt
5.获取band信息
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_band.pl
cat *_band.txt > band.txt
3.复制配置文件到当前目录
cp /share/nas6/xul/program/mt2/cp2mt_circos/ticks.conf .
cp /share/nas6/xul/program/mt2/cp2mt_circos/ideogram2.conf ./ideogram.conf
获取sample.karyotype.txt 和配置文件circos.conf
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_karyotype2.pl -chl chr1 -mt chr2
运行circos程序
/share/nas6/xul/soft/circos/circos-0.69-6/bin/circos -conf circos.conf && svg2xxx -t pdf circos.svg
kaks使用prep预测RNA编辑位点,然后计算
1.预测
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl
保存编辑后的序列
全部解压到一个文件夹中
2.kaks计算
perl /share/nas6/xul/program/mt2/kaks/kaks.each.two_for_prep.pl -i mid/Apium_* -o kaks -r mid/*
3.kaks整理
perl /share/nas6/xul/program/mt2/kaks/src/merge_kaks_for_prep.pl -i kaks/*