一.叶绿体基因组call snp
1.准备 Picard 软件,用于标记重复
wget https://github.com/broadinstitute/picard/releases/download/2.25.7/picard.jar
wget https://github.com/broadinstitute/picard/releases/download/3.3.0/picard.jar
2.建立索引
bwa index NC_066958.1.fasta
mkdir data
for i in `awk '{print $2" "$3}' L001.samples.reads.txt`;do echo $i;ln -s $i data/;done
3.序列比对到叶绿体基因组上
cd data
想办法改改fq文件名称,使除了样品名以外的部分是一致的
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//' | wc -l # 替换掉后缀,统计样品数
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//' | parallel -j 20 "bwa mem -t 4 -M ../NC_066958.1.fasta {}_S1_L008_R1_001.fastq.gz {}_S1_L008_R2_001.fastq.gz 2>{}_map.log | samtools sort -O bam -@ 4 -o {}.sorted.bam 1>{}.sort.log 2>&1" # 最大允许同时运行20个
4.标记重复
## 需要对上述bam中的reads添加分组信息
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//' | parallel -j 30 "
/share/nas1/yuj/software/opt/jdk21/bin/java -jar /share/nas1/yuj/software/picard/3.3.0/picard.jar AddOrReplaceReadGroups -I {}.sorted.bam -O {}.sorted.rg.bam -RGID {} -RGLB {} -RGPL ILLUMINA -RGPU {} -RGSM {}"
# samtools addreplacerg -r '@RG\tID:Simian3hao\tLB:Simian3hao\tPL:ILLUMINA\tPU:Simian3hao\tSM:Simian3hao' -o output_with_rg.bam input.bam # samtools也能添加
## 标记重复
ls *_S1_L008_R1_001.fastq.gz|perl -lpe 's/_S1_L008_R1_001.fastq.gz//' | parallel -j 30 "/share/nas1/yuj/software/opt/jdk21/bin/java -jar /share/nas1/yuj/software/picard/3.3.0/picard.jar MarkDuplicates -I {}.sorted.rg.bam -O {}.sorted.mkdup.bam -M {}.sorted.mkdup.metrics 1>{}_mkdup.log 2>&1" # 新版本java
5.Call SNP
bcftools mpileup -Ou --threads 40 -f ../NC_066958.1.fasta *.mkdup.bam | bcftools call -vm --ploidy 1 --threads 40 > direct.vcf
6.PCA 分析
# 考虑调整 --maf
plink --maf 0.02 --allow-extra-chr --vcf direct.vcf --pca header tabs -out plink.stat
# 提取 PC1 和 PC2,使用 R 或者其他工具进行散点图可视化即可
perl -lpe 's/.sorted.mkdup.bam//g' plink.stat.eigenvec|cut -f1,3,4 > pca.plot.tab
7.进化树 [微信公众平台](https://mp.weixin.qq.com/s/XYMPLuxTLY3_VoC6QgvQsw)
# 使用 VCF2Dis 软件快速计算样品距离矩阵
wget https://github.com/BGI-shenzhen/VCF2Dis/archive/refs/tags/1.44.zip
unzip 1.44.zip
chmod -R a+x VCF2Dis-1.44/
./VCF2Dis-1.44/bin/VCF2Dis -InPut direct.vcf -OutPut sample.vcf.dist
直接到 FastME2 网站工具上,基于距离矩阵,绘制进化树
二.DNAbarcode DNA条形码
mafft --auto --quiet allinone.fa > 0allinone.fa.aln
perl /share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl -i 0allinone.fa.aln -o 0all.aln.fasta
dnasp加载数据,选择“DNA Polymorphism”分析-“file”-“保存当前结果”,保存为“1dnasp.out.txt”
2.筛选和格式化
python3 /share/nas1/yuj/script/chloroplast/dnabarcode/2select_region_with_pi_s.py -i 1dnasp.out.txt -fa 0all.aln.fasta -o1 3format.out -o2 4filter_midpoint.out -p 0.01 -s 50 -ref Anoectochilus_papillosus
# 默认 0.2 100
3.整体绘图
mamba activate pylatest
python3 /share/nas1/yuj/script/chloroplast/advance/pi/cp_full_genome_pi_plot.py -i 3format.out -o 3pi.full.png
mamba deactivate
4.获取比对后基因位置
python3 /share/nas1/yuj/script/chloroplast/advance/pi/position_mapping.py -i 0all.aln.fasta -p Anoectochilus_papillosus -i2 *.gbk.ann -o2 4gene.range.info
5.整理成表格
#筛选标准: pi值>0.01 S值>50
Order Region Window Midpoint Pi Theta S
1 rps4-trnT-trnL 46340-47460 46682 0.01739 0.01791 54
2 rps4-trnT-trnL 46577-47719 47291 0.02958 0.02288 69
3 trnT-trnL-trnF 47114-47956 47619 0.03069 0.0252 76
6.提取序列
删除4filter_midpoint.out中的第一行
for i in `cut -f 1 4filter_midpoint.out`;do echo $i;python3 /share/nas1/yuj/script/chloroplast/dnabarcode/extract_sequence.py -i 0all.aln.fasta -o 6.region.$i.fasta -p $i ;done
三.基因分化热图 遗传距离计算
1.把gb文件放到gbk文件夹中
2.获取同源序列
$ perl /share/nas6/xul/program/chloroplast/phytree/gene_tree/find_total_gene.pl -o gene -i gbk/ -p
$ perl /share/nas6/xul/program/chloroplast/phytree/gene_tree/find_total_gene.pl -i gbk/ -p Gentianella_pygmaea -o gene
# -p后面参数名字和gbk文件名里面名字都要一致
$ cd gene/mafft/
$ rm rrn* trn* && nu=`ls ../../gbk/*.gbk |wc |awk '{print $1}'` && rm `grep -c ">" * |grep -v ":$nu"|cut -d ":" -f 1 `
3.计算每个基因的pairwise distance
$ for i in *.fasta ;do echo Rscript /share/nas6/xul/program/chloroplast/pairwise_distance/pairwise_distance.R $i `basename $i .fasta`.distance.xls ;done > pairwise_distance.sh && thread.pl pairwise_distance.sh 63
4.整理结果
$ perl /share/nas6/xul/program/chloroplast/pairwise_distance/get_result.pl *.distance.xls Gentianella_pygmaea > all.dis.xls
$ realpath all.dis.xls # 这个文件修改第一行的物种名,不要登陆号
5.作图
$ Rscript /share/nas6/xul/program/chloroplast/pairwise_distance/pheatmap.R all.dis*.xls && convert -density 300 dis.heatmap.pdf dis.heatmap.png
四.RNA编辑预测(网站不能用)
# perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file.pl -i final_gene_annotaion.info -f *_FULLCP.fsa -p $sample -o tmp_for_prep # 线粒体
## 叶绿体
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_cds_seq_use_gene_annotation_file_for_chl.pl -i final_gene_annotaion.info -f *_FULLCP.fsa -p $sample -o ./
生成文件*.cds.for_perp-cp.txt
上传到云平台
[生信云](http://112.86.217.82:9919/#/tool/alltool/detail/336)
用不了,结果不对
---------------------------------网站已废弃----------------------------------------------------
打开http://prep.unl.edu/上传文件
下载文件并解压 tar xf Results.tar.gz
perl /share/nas6/xul/program/mt2/annotation/pipline/src/get_edit_site_info.pl -i *_Edit_Sites.txt -p Gentianella_pygmaea -o final_rna_edit_info.xls # 合并表格
五.SSR
3.1 流程里ssr
/share/nas1/yuj/project/GP-20211130-3777-1_20220325/ssr_analysis下有misa.ini misa.pl
1.gbk转tbl
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/src/gb2tbl.pl -i 物种.gbk -o tbl
2.统计各基因外显子内含子的情况
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/src/tbl_stat.exon.pl -i tbl/物种.tbl -f 物种.fasta -o 物种.exon.intron.stat.xls
3.misa分析ssr,生成cpSSR.misa
perl misa.pl 物种.fasta cpSSR
4.解析成更容易看的格式 stat是对info的统计
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/stat.ssr.pl -i cpSSR.misa -t tbl/物种.tbl -ir 83137-109333,127168-153364 -o1 ssr.info.xls -o2 ssr.stat.xls
5.绘图
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/plot.ssr.stat.pl -i ssr.stat.xls -o ./ssr_analysis/
# region.barssr ssr在不同分区上的内含子外显子间区的分布情况
# region.pie ssr在不同分区上比例饼状图
perl /share/nas6/xul/program/chloroplast/ssr/draw_ssr.pl -i number.tmp -o ssr.number.bar
# ssr.number.bar 各类ssr计数柱状图
3.2 ssr引物设计
4286给定ssr 设计其引物
1.生成cpSSR.ssr.p3in(需要前置文件cpssr.misa(ID 编号 产物长度 开始 终止))
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_in.pl cpSSR.misa 物种.fasta ./ssr_analysis/
2.单纯设计引物,可以直接从填写cpSSR.ssr.p3in开始
3.生成cpSSR.ssr.p3out(前置文件cpSSR.ssr.p3in 可手动填写,标签详情见 http://www.chenlianfu.com/?p=284 primer3设计引物详解)
perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i cpSSR.ssr.p3in -t 20 -o ../ssr_analysis
4.解析成更容易看的格式
perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl cpSSR.ssr.p3out cpSSR.misa cpSSR.ssr.primer.xls
3.3 查找多态性好的ssr(分子标记)
脚本路径:
/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa.pl # misa程序
/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa.ini # misa配置文件
/share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl
misa.ini参数设置:
def 1-10,2-6,3-5,4-5,5-5,6-5 或 1-10,2-5,3-4,4-3,5-3,6-3
int 100
definition(unit_size,min_repeats): 1-10 2-5 3-4 4-3 5-3 6-3
interruptions(max_difference_between_2_SSRs): 100
具体步骤:
1.简单看一下共线性
调整为同一起点
2.比对
cat fasta/*.fasta > allinone.fa
mafft --auto --quiet --thread 30 allinone.fa > all.aln.tmp && perl /share/nas6/xul/program/mt2/phytree/gene_tree/src/fasta2line.pl -i all.aln.tmp -o all.aln &
3.对每一个物种进行查找
## perl misa.pl 物种.fasta cpssr(输出文件前缀,自定义)
cd fasta && cp /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/MISA/misa* ./
for i in *.fasta;do echo $i; perl misa.pl $i ${i%.fasta} ;done
4.统计合并
cd ..
perl /share/nas6/xul/program/chloroplast/blast/mafft/aln_ssr2marker.pl -i all.aln -o ssr.xls -d fasta(misa结果所在文件夹)
TBtools对应功能:
SSRMiner && Batch Target Region Primer Design
3.4 批量设计ssr引物
cd misa_result(包含所有.fasta文件)
for i in `ls *.fasta`;do echo $i; perl misa.pl $i ${i%.fasta};done
# 生成misa
for i in *.fasta;do echo $i;mkdir ../${i%.fasta}_ssr_analysis && echo 'done';done
# 生成目录
for i in *.fasta;do echo $i;perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_in.pl ${i%.fasta}.misa $i ../${i%.fasta}_ssr_analysis/ & done
# 生成cpSSR.ssr.p3in
# 前置文件cpssr.misa(ID 编号 产物长度 开始 终止)
(可选)cd ../ && rm */cpSSR.ssr.p3out ; cd misa_result
# 删除所有目录里已有的p3out
for i in *.fasta;do echo $i;perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i ../${i%.fasta}_ssr_analysis/cpSSR.ssr.p3in -t 20 -o ../${i%.fasta}_ssr_analysis/ & done
# 生成cpSSR.ssr.p3out
# 前置文件cpSSR.ssr.p3in 可手动填写,标签详情见 http://www.chenlianfu.com/?p=284 primer3设计引物详解
for i in *.fasta;do echo $i;perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl ../${i%.fasta}_ssr_analysis/cpSSR.ssr.p3out ${i%.fasta}.misa ../${i%.fasta}_ssr_analysis/cpSSR.ssr.primer.xls & done
# 解析成更容易看的格式
六.PCR引物设计
3321全长基因组来设计
## 比对
cat ref/fasta/*.fasta > all.fa
mafft --auto --quiet --thread 30 all.fa > all.aln
/share/nas6/xul/soft/trimal/trimal-1.4.1/source/trimal -automated1 -in all.aln -out all.trim.aln
perl /share/nas6/xul/program/chloroplast/blast/mafft/aln2indel_tmp.pl -i all.trim.aln > all.trim.aln.faa # 把非保守位点替换成N
vim 9.fa # 取出.faa中最后一条序列
---确保输入的序列两端至少各有一个GAP---
python3 /share/nas1/yuj/script/Modify-short-consecutive-conserved-sites-to-N/primer_all_ATGC2N.py -i 9.fa -n 20 -o all.trim.aln.faa.fa # 进一步把非连续的保守位点替换成N/较短连续保守位点修改为N
## 设计
1.填cpssr.misa文件 ID 编号 产物长度 开始 终止,生成引物设计软件的输入文件
$ perl primer3_in.pl cpSSR.misa 物种名 .../Personalisation_analysis/pcr
2.上一步生成cpSSR.ssr.p3in,或者手动填写,标签详情见 http://www.chenlianfu.com/?p=284 primer3设计引物详解
$ perl /share/nas6/xul/program/chloroplast/ssr/ssr_primer_designer.pl -i cpSSR.ssr.p3in -t 20 -o .../Personalisation_analysis/pcr
3.合并设计好的输出结果
$ perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/annotation/etc/misa/src/p3_out.pl cpSSR.ssr.p3out cpSSR.misa cpSSR.ssr.primer.xls