05植物线粒体流程3-高级分析

一.数据准备

1.1 ref下载

## 复制表格中的物种 至 tmp_ad/tree/cp文件(以+号为分割)
# python3 /share/nas1/yuj/script/chloroplast/download_genome_from_ncbi.py -f gb -a list_tre # 备用下载

编号目录]
touch tmp_{ad,tree,cp}

awk -F "+" '{ print $2 }' tmp_ad | awk '$1=$1' > list_ad && cat list_ad && down- ad &
awk -F "+" '{ print $2 }' tmp_tree | awk '$1=$1' > list_tree && cat list_tree && down- tree &
awk -F "+" '{ print $2 }' tmp_cp | awk '$1=$1' > list_cp && cat list_cp && down- cp &

mv ref_* analysis/

1.2 测序样品及ref处理(多序列物种)

需要合并多序列物种的gb文件,变成单序列,方便后续分析
1.对样品进行处理,合并gene_anno中单个gb(1.gb、2.gb、3.gb等)文件到上一级目录analysis/annotation/Polygala_tenuifolia_Willd中(组装物种多序列时使用)

perl /share/nas6/xul/program/mt2/annotation/pipline/src/merge_multi_chr_mt_gbk.pl -i gene_anno/*.gbk > `basename *.fasta .fasta`.gbk 
!!!不能有相同的物种名 !!! ???

--------- 先看2.0章节检查物种
2.对高级分析和进化树物种处理
!!!只能一个一个物种处理!!!

## 合并高级分析中多环线粒体的gb文件(高级分析物种多序列时使用)
perl /share/nas6/xul/program/mt2/annotation/pipline/src/merge_multi_chr_mt_gbk.pl -i *.gbk > merge.gbk

## fasta2中的多序列要cat到一个文件中,然后重命名
比如说参考A有两条序列都在fasta中
cat fasta2/* > merge.fasta

每一个合并后的fa文件里面都是多条序列,然后用下面的命令对多序列重命名
for j in *.fasta ;do perl -i  -lane 'if(/^>/){if($. == 1){print "$_","_",++$i}else{print ">",++$i}}else{print}' $j ; done

1.3 改名

## 给gbk改名
GP-XXX]$ cd ref_adv/gbk && rename .gbk .1.gbk * && cd - # 加上.1,一般不需要
GP-XXX]$ cd ref_adv/fasta && for i in *.fasta;do rename               `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`       `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print $2"""_"""$1""".1" }'`   ../gbk/""`awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`.gbk  ;done  && cd ../../

## 给fasta改名
GP-XXX]$ cd ref_adv/fasta && rename .fasta .1.fasta * && cd - # 加上.1,一般不需要
GP-XXX]$ cd ref_adv/fasta && for i in *.fasta;do rename              `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`       `awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print $2"""_"""$1""".1" }'`   ../fasta/""`awk '{if(NR==1) print $0}' $i | awk -F ">" '{ print $2 }' | awk -F ".1_" '{ print ""$1""".1" }'`.fasta  ;done  && cd ../../

二.共有CDS进化树

2.0 检查进化树物种

prefix=`basename assembly/*/*.fasta .fasta` # 设置下前缀名

cd ref_tree/gbk
grep DEFI * # 看下是不是都是完整的
grep -c "   CDS" * # 确保都有注释
grep DEFI *  | grep chrom # 看下有没有把一个线粒体的多序列分好几个gbk,有的话需要全部下下来,用上面的ref合并处理

2.1 ML

cd analysis
mkdir phytree && cd phytree && mkdir gbk/ && cp ../annotation/*/*.gbk  ../ref_tree/gbk/*.gbk gbk # 这里用的单序列

1.提取基因比对
perl /share/nas6/xul/program/mt2/phytree/gene_tree/find_total_gene.pl  -i gbk/  -o gene -p $prefix && \

## 可以不管,一般不考虑
for i in gene/mafft/*.fasta ; do echo $i;  perl /share/nas6/xul/program/chloroplast/pi/compute_pi_use_aln_fasta.pl  -i  $i | grep Nucleotide  ;done && \ # 可以先运行这一步,看一下pi值,把异常的基因从其物种gbk中删除
##  可以不管,一般不考虑
## 查看下数量,例子如下
ls gbk/ |wc
31      31     501 # 一共31个物种
## 看下每个基因是否都在31个物种里
grep -c ">" gene/mafft/*
gene/mafft/gene10.atp1.fasta:31
gene/mafft/gene11.atp8.fasta:31
gene/mafft/gene12.cox3.fasta:31
gene/mafft/gene13.sdh4.fasta:15
## 看下31个物种的共有基因数量
grep -c ">" gene/mafft/*  | grep ":31"  | wc -l
去掉非共有基因

2.
perl /share/nas6/xul/program/mt2/phytree/connection_head_tail.pl -i gene/mafft/*.fasta -o outseq && \

/share/nas6/xul/soft/trimal/trimal-1.4.1/source/trimal -in outseq/gene_seq.aln  -out outseq/gene_seq.aln.trim.aln -gt 0.7 && \

nohup perl /share/nas6/xul/program/mt2/phytree/raxml_phytree.pl -i outseq/gene_seq.aln.trim.aln -aln  -o ./phytree && \

nohup java -jar  /share/nas6/zhouxy/biosoft/jmodeltest/jmodeltest-2.1.10/jModelTest.jar -i -f -g 4 -BIC -AIC -AICc -DT -tr 15 -o jmodeltest -d  outseq/gene_seq.aln.trim.aln & # 模型预测
 
perl /share/nas6/xul/program/mt2/phytree/get_id_genbank.pl ../ref_tree/gbk/*.gbk > id.list && perl /share/nas6/xul/program/mt2/phytree/trans_name.pl phytree/sample.cds.nwk > cds.nwk # 进化树改名

2.2 MB

mkdir mb && cd mb

cp_aln2nexus.pl  ../outseq/gene_seq.aln.trim.aln > gene.nex

/share/nas6/xul/soft/mrbayes/MrBayes-3.2.7/src/mb

Execute gene.nex
lset nst=6 rates=invgamma
mcmc ngen=1000000 samplefreq=100 printfreq=100 diagnfreq=5000
no
sump burnin=2500(ngen/samplefreq)*0.25
sumt burnin=2500

perl /share/nas6/xul/program/mt2/phytree/trans_name.pl mb/gene.nex.con.tre > mb.tre

2.3 MP

FastTreeMP  -nt -gtr < gene_seq.aln.trim.aln > samples.phytree.ML.nwk

perl /share/nas6/xul/program/mt2/phytree/get_id_genbank.pl gbk/*.gbk > id.list

perl /share/nas6/xul/program/mt2/phytree/trans_name.pl phytree/sample.cds.nwk > cds.nwk

2.4 IQ-tree

#perl /share/nas6/xul/program/mt2/phytree/gene_tree/find_total_gene_intron.pl -i gbk/ -p PP231953  -o  gene

iqtree2 -s outseq/gene_seq.aln.trim.aln -m MFP  -B 1000 -T 40

三.高级分析

3.1 pi

cd analysis
prefix=`basename assembly/*/*.fasta .fasta`

nohup perl /share/nas6/xul/program/mt2/pi/pi.no.ir.analysis.pl  -o pi -i annotation/*/*.gbk  ref_ad/gbk/*.gbk -p $prefix &
同样查看下异常值,检查下对应基因序列

## 仅绘图
Bin="/share/nas6/xul/program/mt2/pi"
perl $Bin/src/plot.pi.no.ir.pl -i pi.stat.xls -o pi.gene.svg

## 按客户要求,修改宽度
python3 /share/nas1/yuj/script/chloroplast/advance/pi/plmt_plot.pi.no.ir.py  -i pi.stat.xls -o pi.gene.png
# 里面规定了字体,需要在windows下绘图

3.2 cp2mt

nohup perl /share/nas6/xul/program/mt2/annotation_for_multi_seq/pipline/cp2mt_circos/bin/cp_mt2_circos.v2.pl -o cp_mt_links  -mt annotation/$prefix/gene_anno/$prefix.gbk -cp ref_cp/gbk/*.gbk &
## -orf annotation/Capsicum_frutescens/orf_annotation.info

3.3 kaks

nohup perl /share/nas6/xul/program/mt2/kaks/kaks.each.two.pl -i annotation/*/*.gbk ref_ad/gbk/*.gbk -r annotation/*/*.gbk ref_ad/gbk/*.gbk -o kaks && perl /share/nas6/xul/program/chloroplast/kaks/merge_kaks.pl -i kaks/*/*.xls -o kaks/ && Rscript /share/nas6/xul/program/chloroplast/kaks/boxplot_kaks.R kaks/gene.kaks.xls  kaks/gene.kaks.pdf  && convert -density 300 kaks/gene.kaks.pdf kaks/gene.kaks.png & 

## 改名,用于条形图,以下命令
perl /share/nas6/xul/program/mt2/kaks/get_id_genbank.pl ../ref_ad/gbk/*.gbk > id.list
for i in $prefix*kaks2.xls ;do perl /share/nas6/xul/program/mt2/kaks/trans_name.pl $i > `basename $i .kaks2.xls`_for_draw.xls ;done
sed -i 's/NA/0/g' *_for_draw.xls
for i in *_for_draw.xls; do Rscript /share/nas6/xul/program/chloroplast/kaks/plot_kaks.R $i `basename $i .xls`.pdf && convert -density 300 `basename $i .xls`.pdf `basename $i .xls`.png;done
rm *2.xls

## 箱线图
Rscript /share/nas6/xul/program/chloroplast/kaks/boxplot_kaks.R gene.kaks.xls  gene.kaks.pdf  && convert -density 300 gene.kaks.pdf gene.kaks.png

mkdir mid && cp */*xls mid

3.4 collinearity

mkdir collinearity && cd collinearity && mkdir fasta && cp ../ref_ad/fasta2/*.fasta fasta && cp ../assembly/*/*.fasta ./

!!!!!!!!!!!修改fasta内名称!!!!!!!!!!!! 多条序列的需要将第一条序列改成 物种_染色体名
示例:
>1
>2
改成
>Tussilago_farfara_L._1
>2

1.mummer类型显示 
for i in fasta/*.fasta ;do for j in *.fasta ;do nucmer $j $i --maxmatch && show-coords out.delta > $i.`basename $j .fasta`.coords ;done ;done && rm out.delta &

n=2 && perl /share/nas6/xul/program/mt2/coline/src/plot_multi_mummer.pl -i fasta/*.coords -o mummerplot.svg  -n $n && svg2xxx -dpi 300  -t png mummerplot.svg && svg2xxx   -t pdf  mummerplot.svg # n表示一行显示的图片框个数

2.同线性显示 synteny 不做了
perl /share/nas6/xul/program/mt2/coline/src/get_collinearity_input.pl -i *.fasta  fasta/*.fasta && realpath tmp_collinearity/input.cfg & # input.cfg第一列是样品名,第四列是染色体名

# perl /share/nas6/xul/program/mt2/coline/src/draw_collinearity.pl -i tmp_collinearity/input.cfg  -l tmp_collinearity/*.txt -o synteny.svg  && svg2xxx -t png -dpi 300 synteny.svg # 3种颜色
perl /share/nas1/yuj/program/mt2/coline/src/draw_collinearity.pl -i tmp_collinearity/input.cfg  -l tmp_collinearity/*.txt -o synteny.svg  && svg2xxx -t png -dpi 300 synteny.svg # 15种颜色

【(2)使用blastn(2.10.1+)软件,设置-word_size为7,E-value为1e-5,并筛选比对长度大于300 bp的片段,对组装的物种和所选的物种依次进行比对,使用perl编写绘图程序绘制共线性图,如下所示:  
注:每行的方框表示一个基因组,中间的连线表示同源性区域。】  
这个图是仿照MCScanX软件 https://github.com/wyp1125/MCScanX 来的

四.其他分析

trna 二级结构

perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_rnaflod_from_trnascan_ss.pl -i *.ss

perl /share/nas6/xul/program/mt/tRNA/draw_tRNA.pl -i trn/*.svg 

剪切方式、rna编辑位点(prep)绘图

1.获取各个样品的rna编辑位点
perl  /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl 
cat *.txt >all.txt

2.生成配置文件,检查配置文件,注意trans
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_edit_site_trans_cis_info.pl -i1 ../*.info -i2 *.txt > cfg

3.绘图
cfg:/share/nas6/xul/project/xianliti2/GP-20200909-2269_sichuanmongdadongwukejixueyuan_1smaple_laomangmai_xianliti2/analysis/edit_cis_trans_plot/cfg

perl /share/nas6/xul/program/mt2/annotation/pipline/src/draw_edit_site_trans_cis.pl -i cfg -p Elymus_sibiricus

绘制带有进化树的一张图

制作nwk文件,用mega调整好,放到cfg第一行,以#开头

perl /share/nas6/xul/program/mt2/annotation/pipline/src/draw_edit_site_trans_cis_for_one_fig.pl -i cfg -p Elymus_sibiricus

与核基因组之间的共线性

多序列:
perl /share/nas6/xul/program/mt2/annotation_for_multi_seq/pipline/cp2mt_circos/bin/mt_nucler_circos.pl -mt ../annotation/Prunus_davidiana/gene_anno/Prunus_davidiana.gbk -nuc GCA_020226225.1_ST_assembly_genomic.fna -o tmp

#单条序列:

#合并mt chl序列,把id分别改成mt  cp!!
cat mt.fa chl.fa > mt_and_chl.fa 

#获取共线性信息,修改染色体名称,改成数字!!!,然后运行:
blastn  -query  mt_and_chl.fa  -subject chr.fa  -outfmt 6 -evalue 1e-5  > blast.xls

分段表示,500,500-1000,>1000
perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_mt1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_mt2.txt &&  perl -lane 'if($F[2]>=70 and $F[0] eq "mt"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_mt3.txt 

#perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_mt1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_mt2.txt &&  perl -lane 'if($F[2]>=70 and $F[0] eq "chr1"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_mt3.txt 

perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) < 500)}' blast.xls > link_cp1.txt && perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 500 and abs($F[8]-$F[9]) < 1000)}' blast.xls > link_cp2.txt &&  perl -lane 'if($F[2]>=70 and $F[0] eq "cp"){print "$F[1] $F[8] $F[9] $F[0] $F[6] $F[7]" if(abs($F[8]-$F[9]) >= 1000)}' blast.xls > link_cp3.txt 

#复制配置文件到当前目录
cp /share/nas6/xul/program/mt2/cp2mt_circos/ticks.conf /share/nas6/xul/program/mt2/cp2mt_circos/ideogram.conf .

#获取sample.karyotype.txt 和配置文件circos.conf 以及
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_karyotype_for_genome.pl -m mt_and_chl.fa -g chr.fa

#运行circos程序
/share/nas6/xul/soft/circos/circos-0.69-6/bin/circos  -conf circos.conf && svg2xxx -t pdf circos.svg

cp /share/nas6/xul/program/mt2/cp2mt_circos/readme_mt_genome.link.txt .

# 单序列
perl /share/nas6/xul/program/mt2/cp2mt_circos/gene_in_repeat_for_blastn.pl -i1 ../annotation/Brassica_rapa_var._purpuraria/gene_anno/Brassica_rapa_var._purpuraria.gbk  -i2 blast.xls

#统计同源序列长度分布和每条染色体上同源序列占比
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_len_distri.pl -i blast_Mcor.xls -g Mcor.chr.fasta  -o ./

cat chr_distri.xls  | grep -v chr  | cut -f 1  | perl -lane 'print "\"$_\""' | paste -s -d ","

vi /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_percent.R

Rscript /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_percent.R chr_distri.xls  chr_distri.pdf

Rscript /share/nas6/xul/program/mt2/cp2mt_circos/plot_len_number.R len_distri.xls len_distri.pdf

物种间circos图


最外层:圆圈外侧和内侧所示的基因块分别顺时针和逆时针转录。Genes from the samecomplex are similarly colored. 

0.获取线粒体注释文件,需要增加gene标签

1.获取gene highligths文件,按照基因复合物来定义颜色,识别的是gene标签,单个文件获取,然后合并
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_gene_highlights2.pl -chr chr1 -i info1

cat *_gene_name.txt > gene_name.txt

2.获取同源序列,注意序列
blastn  -query  chr1.fa -subject chr2.fa   -outfmt 6 -evalue 1e-5  > blast.xls

perl -lane 'if($F[2]>=70){print "chr1 $F[6] $F[7] chr2 $F[8] $F[9]"}' blast.xls > link.txt

3.获取序列gc含量
perl /share/nas6/xul/program/mt2/cp2mt_circos/compute_gc_with_window.pl

cat *_gc.txt > gc.txt

4.获取覆盖深度
cat *_depth.txt > depth.txt

5.获取band信息
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_band.pl 

cat *_band.txt > band.txt


3.复制配置文件到当前目录
cp /share/nas6/xul/program/mt2/cp2mt_circos/ticks.conf .

cp /share/nas6/xul/program/mt2/cp2mt_circos/ideogram2.conf ./ideogram.conf

获取sample.karyotype.txt 和配置文件circos.conf
perl /share/nas6/xul/program/mt2/cp2mt_circos/get_karyotype2.pl -chl chr1 -mt chr2 

运行circos程序
/share/nas6/xul/soft/circos/circos-0.69-6/bin/circos  -conf circos.conf && svg2xxx -t pdf circos.svg

kaks使用prep预测RNA编辑位点,然后计算

1.预测
perl  /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl 
保存编辑后的序列
全部解压到一个文件夹中

2.kaks计算
perl /share/nas6/xul/program/mt2/kaks/kaks.each.two_for_prep.pl -i mid/Apium_* -o kaks -r mid/*

3.kaks整理
perl /share/nas6/xul/program/mt2/kaks/src/merge_kaks_for_prep.pl -i kaks/*