04叶绿体流程4-个性化分析

一.叶绿体基因组call snp

1.准备 Picard 软件,用于标记重复
wget https://github.com/broadinstitute/picard/releases/download/2.25.7/picard.jar
wget https://github.com/broadinstitute/picard/releases/download/3.3.0/picard.jar

2.建立索引
bwa index NC_066958.1.fasta
mkdir data
for i in `awk '{print $2" "$3}' L001.samples.reads.txt`;do echo $i;ln -s $i data/;done

3.序列比对到叶绿体基因组上
cd data
想办法改改fq文件名称,使除了样品名以外的部分是一致的
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//'   | wc -l # 替换掉后缀,统计样品数
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//'   | parallel -j 20 "bwa mem -t 4 -M ../NC_066958.1.fasta  {}_S1_L008_R1_001.fastq.gz  {}_S1_L008_R2_001.fastq.gz  2>{}_map.log  | samtools sort  -O  bam -@ 4  -o  {}.sorted.bam 1>{}.sort.log  2>&1" # 最大允许同时运行20个

4.标记重复
## 需要对上述bam中的reads添加分组信息
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//'   | parallel -j 30 "
/share/nas1/yuj/software/opt/jdk21/bin/java  -jar /share/nas1/yuj/software/picard/3.3.0/picard.jar  AddOrReplaceReadGroups  -I  {}.sorted.bam -O  {}.sorted.rg.bam -RGID {}  -RGLB {}  -RGPL ILLUMINA  -RGPU {}  -RGSM {}"
# samtools addreplacerg -r '@RG\tID:Simian3hao\tLB:Simian3hao\tPL:ILLUMINA\tPU:Simian3hao\tSM:Simian3hao' -o output_with_rg.bam input.bam # samtools也能添加

## 标记重复
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//'   | parallel -j 30 "/share/nas1/yuj/software/opt/jdk21/bin/java  -jar /share/nas1/yuj/software/picard/3.3.0/picard.jar  MarkDuplicates -I  {}.sorted.rg.bam -O  {}.sorted.mkdup.bam -M  {}.sorted.mkdup.metrics  1>{}_mkdup.log 2>&1" # 新版本java

5.Call SNP
bcftools mpileup  -Ou --threads 40 -f  ../NC_066958.1.fasta   *.mkdup.bam  |  bcftools call -vm --ploidy 1  --threads 40 > direct.vcf

6.PCA 分析
# 考虑调整 --maf
plink --maf 0.02 --allow-extra-chr --vcf direct.vcf --pca header tabs -out plink.stat
# 提取 PC1 和 PC2,使用 R 或者其他工具进行散点图可视化即可
perl -lpe 's/.sorted.mkdup.bam//g' plink.stat.eigenvec|cut -f1,3,4 > pca.plot.tab

7.进化树 [微信公众平台](https://mp.weixin.qq.com/s/XYMPLuxTLY3_VoC6QgvQsw)
# 使用 VCF2Dis 软件快速计算样品距离矩阵
wget https://github.com/BGI-shenzhen/VCF2Dis/archive/refs/tags/1.44.zip
unzip 1.44.zip
chmod -R a+x VCF2Dis-1.44/
./VCF2Dis-1.44/bin/VCF2Dis -InPut direct.vcf -OutPut sample.vcf.dist
直接到 FastME2 网站工具上,基于距离矩阵,绘制进化树

二.DNAbarcode DNA条形码

mafft  --auto  --quiet  allinone.fa  > 0allinone.fa.aln
perl /share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl -i 0allinone.fa.aln -o 0all.aln.fasta
dnasp加载数据,选择“DNA Polymorphism”分析-“file”-“保存当前结果”,保存为“1dnasp.out.txt”

2.筛选和格式化
python3 /share/nas1/yuj/script/chloroplast/dnabarcode/2select_region_with_pi_s.py  -i  1dnasp.out.txt  -fa   0all.aln.fasta    -o1  3format.out  -o2  4filter_midpoint.out   -p 0.01 -s  50  -ref  Anoectochilus_papillosus
# 默认 0.2  100

3.整体绘图
mamba activate pylatest
python3 /share/nas1/yuj/script/chloroplast/advance/pi/cp_full_genome_pi_plot.py -i 3format.out -o 3pi.full.png
mamba deactivate

4.获取比对后基因位置
python3 /share/nas1/yuj/script/chloroplast/advance/pi/position_mapping.py -i 0all.aln.fasta -p Anoectochilus_papillosus -i2 *.gbk.ann -o2 4gene.range.info

5.整理成表格
#筛选标准: pi值>0.01 S值>50						
Order	Region	Window	Midpoint	Pi	Theta	S
1	rps4-trnT-trnL	46340-47460	46682	0.01739	0.01791	54
2	rps4-trnT-trnL	46577-47719	47291	0.02958	0.02288	69
3	trnT-trnL-trnF	47114-47956	47619	0.03069	0.0252	76

6.提取序列
删除4filter_midpoint.out中的第一行
for i in `cut -f 1 4filter_midpoint.out`;do echo $i;python3 /share/nas1/yuj/script/chloroplast/dnabarcode/extract_sequence.py -i 0all.aln.fasta -o  6.region.$i.fasta -p $i ;done

三.基因分化热图 遗传距离计算

1.把gb文件放到gbk文件夹中

2.获取同源序列
$ perl /share/nas6/xul/program/chloroplast/phytree/gene_tree/find_total_gene.pl  -o gene -i gbk/ -p 
$ perl /share/nas6/xul/program/chloroplast/phytree/gene_tree/find_total_gene.pl  -i gbk/ -p Gentianella_pygmaea  -o gene   
# -p后面参数名字和gbk文件名里面名字都要一致
$ cd gene/mafft/
$ rm rrn* trn* && nu=`ls ../../gbk/*.gbk |wc |awk '{print $1}'` && rm `grep -c ">" * |grep -v ":$nu"|cut -d ":" -f 1 `

3.计算每个基因的pairwise distance
$ for i in *.fasta ;do echo Rscript /share/nas6/xul/program/chloroplast/pairwise_distance/pairwise_distance.R $i `basename $i .fasta`.distance.xls ;done > pairwise_distance.sh && thread.pl pairwise_distance.sh  63

4.整理结果
$ perl /share/nas6/xul/program/chloroplast/pairwise_distance/get_result.pl *.distance.xls  Gentianella_pygmaea >  all.dis.xls
$ realpath all.dis.xls # 这个文件修改第一行的物种名,不要登陆号

5.作图
$ Rscript /share/nas6/xul/program/chloroplast/pairwise_distance/pheatmap.R all.dis*.xls && convert -density 300 dis.heatmap.pdf  dis.heatmap.png

四.RNA编辑预测(网站不能用)

# perl  /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl  -i final_gene_annotaion.info  -f *_FULLCP.fsa -p $sample -o tmp_for_prep # 线粒体

## 叶绿体
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file_for_chl.pl -i final_gene_annotaion.info  -f *_FULLCP.fsa -p $sample -o ./ 
生成文件*.cds.for_perp-cp.txt
上传到云平台
[生信云](http://112.86.217.82:9919/#/tool/alltool/detail/336)
用不了,结果不对

---------------------------------网站已废弃----------------------------------------------------
打开http://prep.unl.edu/上传文件
下载文件并解压 tar xf Results.tar.gz 
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_edit_site_info.pl -i *_Edit_Sites.txt -p Gentianella_pygmaea -o  final_rna_edit_info.xls # 合并表格

五.SSR

3.1 流程里ssr

/share/nas1/yuj/project/GP-20211130-3777-1_20220325/ssr_analysis下有misa.ini misa.pl

1.gbk转tbl
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/src/gb2tbl.pl -i 物种.gbk -o tbl

2.统计各基因外显子内含子的情况
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/src/tbl_stat.exon.pl -i tbl/物种.tbl -f 物种.fasta -o 物种.exon.intron.stat.xls

3.misa分析ssr,生成cpSSR.misa
perl misa.pl 物种.fasta cpSSR

4.解析成更容易看的格式 stat是对info的统计
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/stat.ssr.pl -i cpSSR.misa -t tbl/物种.tbl -ir 83137-109333,127168-153364 -o1 ssr.info.xls -o2 ssr.stat.xls

5.绘图
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/plot.ssr.stat.pl -i ssr.stat.xls -o ./ssr_analysis/ 
# region.barssr	ssr在不同分区上的内含子外显子间区的分布情况
# region.pie	ssr在不同分区上比例饼状图

perl /share/nas6/xul/program/chloroplast/ssr/draw_ssr.pl -i number.tmp -o ssr.number.bar
# ssr.number.bar	各类ssr计数柱状图

3.2 ssr引物设计

4286给定ssr 设计其引物

1.生成cpSSR.ssr.p3in(需要前置文件cpssr.misa(ID 编号 产物长度 开始 终止))
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_in.pl cpSSR.misa 物种.fasta ./ssr_analysis/

2.单纯设计引物,可以直接从填写cpSSR.ssr.p3in开始

3.生成cpSSR.ssr.p3out(前置文件cpSSR.ssr.p3in  可手动填写,标签详情见 http://www.chenlianfu.com/?p=284 	primer3设计引物详解)
perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i cpSSR.ssr.p3in -t 20 -o ../ssr_analysis

4.解析成更容易看的格式
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl cpSSR.ssr.p3out cpSSR.misa cpSSR.ssr.primer.xls

3.3 查找多态性好的ssr(分子标记)

脚本路径:
/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa.pl # misa程序
/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa.ini # misa配置文件
/share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl

misa.ini参数设置:
def 1-10,2-6,3-5,4-5,5-5,6-5 或 1-10,2-5,3-4,4-3,5-3,6-3	
int 100

definition(unit_size,min_repeats):                  1-10 2-5 3-4 4-3 5-3 6-3	
interruptions(max_difference_between_2_SSRs):        100

具体步骤:
1.简单看一下共线性
调整为同一起点

2.比对
cat fasta/*.fasta > allinone.fa
mafft --auto --quiet --thread 30 allinone.fa > all.aln.tmp && perl /share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl -i all.aln.tmp -o all.aln &

3.对每一个物种进行查找
## perl misa.pl 物种.fasta cpssr(输出文件前缀,自定义)
cd fasta && cp /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa*  ./
for i in *.fasta;do echo $i; perl misa.pl $i ${i%.fasta} ;done

4.统计合并
cd ..
perl /share/nas6/xul/program/chloroplast/blast/mafft/aln_ssr2marker.pl -i all.aln -o ssr.xls -d fasta(misa结果所在文件夹)

TBtools对应功能:
SSRMiner && Batch Target Region Primer Design

3.4 批量设计ssr引物

cd misa_result(包含所有.fasta文件)

for i in `ls *.fasta`;do echo $i; perl misa.pl $i ${i%.fasta};done
# 生成misa

for i in *.fasta;do echo $i;mkdir ../${i%.fasta}_ssr_analysis && echo 'done';done
# 生成目录

for i in *.fasta;do echo $i;perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_in.pl ${i%.fasta}.misa $i ../${i%.fasta}_ssr_analysis/  & done
# 生成cpSSR.ssr.p3in	
# 前置文件cpssr.misa(ID 编号 产物长度 开始 终止)

(可选)cd ../ && rm */cpSSR.ssr.p3out ; cd misa_result
# 删除所有目录里已有的p3out

for i in *.fasta;do echo $i;perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i ../${i%.fasta}_ssr_analysis/cpSSR.ssr.p3in -t 20 -o ../${i%.fasta}_ssr_analysis/  & done
# 生成cpSSR.ssr.p3out
# 前置文件cpSSR.ssr.p3in  可手动填写,标签详情见 http://www.chenlianfu.com/?p=284 	primer3设计引物详解

for i in *.fasta;do echo $i;perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl ../${i%.fasta}_ssr_analysis/cpSSR.ssr.p3out ${i%.fasta}.misa ../${i%.fasta}_ssr_analysis/cpSSR.ssr.primer.xls & done
# 解析成更容易看的格式

六.PCR引物设计

3321全长基因组来设计

## 比对
cat ref/fasta/*.fasta > all.fa
mafft --auto --quiet --thread 30 all.fa > all.aln
/share/nas6/xul/soft/trimal/trimal-1.4.1/source/trimal -automated1 -in all.aln -out all.trim.aln
perl /share/nas6/xul/program/chloroplast/blast/mafft/aln2indel_tmp.pl -i all.trim.aln > all.trim.aln.faa # 把非保守位点替换成N

vim 9.fa # 取出.faa中最后一条序列
---确保输入的序列两端至少各有一个GAP---
python3 /share/nas1/yuj/script/Modify-short-consecutive-conserved-sites-to-N/primer_all_ATGC2N.py -i 9.fa -n 20 -o all.trim.aln.faa.fa # 进一步把非连续的保守位点替换成N/较短连续保守位点修改为N


## 设计
1.填cpssr.misa文件 ID 编号 产物长度 开始 终止,生成引物设计软件的输入文件
$ perl primer3_in.pl cpSSR.misa 物种名 .../Personalisation_analysis/pcr

2.上一步生成cpSSR.ssr.p3in,或者手动填写,标签详情见 http://www.chenlianfu.com/?p=284 	primer3设计引物详解
$ perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i cpSSR.ssr.p3in -t 20 -o .../Personalisation_analysis/pcr

3.合并设计好的输出结果
$ perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl cpSSR.ssr.p3out cpSSR.misa  cpSSR.ssr.primer.xls