04叶绿体流程1-组装
三代数据HIFI
组装
1.转换
bamToFastq -i *.bam -fq unmapped.fastq # PacBio
2.比对
assref=ref.fa && pbfile=unmapped.fastq && nu=1000000 && minimap2 -t 20 -ax map-pb $assref $pbfile > extend.sam && samtools view -F 4 extend.sam | perl -lane 'print unless($F[9] eq "*")' | perl -ane 'print if(/^@/);if(/NM:i:(\d+)/){$n=$1;$l=0;$l+=$1 while $F[5]=~ /(\d+)[M]/g;if($l > 5000){print}}'|sort -k 4 -n| tail -n $nu > tmp.sam && cat tmp.sam |cut -f 10 |perl -lane 'print ">",++$i;print $F[0]' > map_gene.fa
3.校正
mamba activate pb-assembly
/share/nas6/zhouxy/biosoft/perl/current/bin/perl /share/nas1/yuj/software/micromamba/envs/pb-assembly/bin/canu -correct -p correct -d correct genomeSize=200k useGrid=false -pacbio map_gene.fa
4.组装
mamba activate chloroplast
ptGAUL.sh -r ref.fa -l correct/correct.correctedReads.fasta.gz -t 8 -f 3000 -o ./
mamba deactivate
质控
minimap2 -ax map-pb Clematis_cadmia_FULLCP.fsa unmapped.fastq > aln.sam && samtools view -h -F 4 aln.sam |samtools sort > 01_map.sort.bam && samtools index 01_map.sort.bam && samtools depth -m 100000 -a 01_map.sort.bam |awk '{print "mt1",$2,$2,$3}' > sample.depth.txt && python3 /share/nas1/yuj/script/chloroplast/annotation/draw_line_depth_v2.py sample.depth.txt line_depth.png
一.前期准备
数据位置
/share/nas1/seqloader/yelvti/GP-xxxx_yelvti/2.clean_data
#先看污染比对结果!!!!有问题直接先反馈!!!!
1.污染比对
perl /share/nas6/zhouxy/functional_modules/pollution_nt_blast/pollution_nt_blast_pip_v3.pl -fqdir 路径/2.clean_data -od 输出目录
2.得到ass文件,改成拉丁名,下划线
python3 /share/nas1/yuj/script/chloroplast/get_ass_cfg.py -i 路径(邮件分发) #会自动补上2.clean_data
二.组装
数据库位置
/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/dat/cpdb/plastid.genome.sequence.fasta
nohup perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip.pl -i ass.cfg & # 组装核心流程
nohup perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip3.pl -i ass.cfg & ???没修改好,不要用
当bowtie超过指定大小立马组装
以下脚本为:
当bowtie超过50M或者3_Spades_Assembly已经创建,切断流程,进行组装
#!/bin/bash
while true; do
if [ -d "analysis/assembly/"*"/pseudo/"*"/3_Spades_Assembly" ]; then
echo "文件夹 '3_Spades_Assembly' 已创建"
break # 跳出循环
fi
if [ -f "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping/map_pair_hits.1.fq" ]; then
if ((`stat -c %s "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping/map_pair_hits.1.fq"` > 50 * 1024 * 1024)); then
echo "文件 'map_pair_hits.1.fq' 大小超过50M"
break # 跳出循环
fi
fi
sleep 30 # 等待一段时间后再次进行检查
done
echo "条件满足,可以继续执行后续命令"
# 获取之前的进程ID
previous_pid=$(pgrep -f "/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip.pl")
# 终止之前的进程
if [ -n "$previous_pid" ]; then
kill "$previous_pid"
fi
cd "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping"
head -n 500000 map_pair_hits.1.fq > cut.1.fq
head -n 500000 map_pair_hits.2.fq > cut.2.fq
unicycler.py -1 cut.1.fq -2 cut.2.fq -o uni
三.组装调整
3.0 通过节点和深度筛选gfa
cd 1_Trimmed_Reads/spades
blastn -query ref.fa -subject assembly_graph.fastg -outfmt 6 -out blast.log
获得node序号,写入id.txt
perl /share/nas6/xul/program/mt2/assembly/src/filter_node_from_fastg.pl -g assembly_graph.fastg -i id.txt -d 99(深度)
将pe150 reads连接成更长的reads
/share/nas6/zhouxy/biosoft/pandaseq/current/bin/pandaseq -f map_pair_hits.1.fq -r map_pair_hits.2.fq > merge.fa 2> log
3.1 批量运行
## 查看起点
for i in ref_ann/fasta/*.fasta*;do nuc $i *.fsa -l 1 && awk 'NR==4 {print $3}' out.delta ;done
## 批量移动finish
mkdir fsa && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/finish/$i"_FULLCP.fsa" fsa/$i"_FULLCP.fsa";done
## org 批量移动3_Spades_Assembly的gfa
mkdir org && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/extended_K121.assembly_graph.fastg org/$i"_K121.assembly.fastg" || cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/extended_spades/assembly_graph.fastg org/$i"_assembly_graph.fastg";done
## 批量复制org结果
mkdir org_result
for i in `ls analysis/assembly/`;do echo $i;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/embplant_pt.K121.complete.graph1.1.path_sequence.fasta org_result/$i.fasta;done
cd org_result && for i in *fasta;do filename=$(basename $i .fasta) && sed -i "1s/.*/>$filename/" $i;done && rename .fasta _FULLCP.fsa * # 修改第一行ID
## 批量根据参考的ssc方向矫正组装结果 串行
cp_add.py -i ref.fa -p 107371-125499:+ -sn refssc.fa
for i in org_result/*.fasta;do cp_add.py -i $i -p $(ir $i | grep SSC | awk -F ":" '{print $2}'):+ -sn ssc.fa 1>>log1 2>>log2 ; nuc refssc.fa ssc.fa && if [ $(awk 'NR==4 {print $3}' out.delta) -gt $(awk 'NR==4 {print $4}' out.delta) ]; then echo $i && cp_ssc.pl -i $i -o $i ;fi ;done
## 并行矫正
cp_add.py -i ref.fa -p 107371-125499:+ -sn refssc.fa
for i in *.fasta; do (cp_add.py -i $i -p $(perl /share/nas6/xul/program/chloroplast/scope/yelvti_sc.pl -i $i | grep SSC | awk -F ":" '{print $2}'):+ -sn $i.ssc.fa 1>>log1 2>>log2; nuc ../refssc.fa $i.ssc.fa -p $i && (if [ $(awk 'NR==4 {print $3}' $i.delta) -gt $(awk 'NR==4 {print $4}' $i.delta) ] ; then echo $i && perl /share/nas6/xul/program/chloroplast/bin/cp_ssc.pl -i $i -o $i; fi) )& done
# 批量把调整好的结果放回finish
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;mkdir analysis/assembly/$i/finish -p ;cp org_result/${i}_FULLCP.fsa analysis/assembly/$i/finish;done
## 批量展示流程结果
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;ll analysis/assembly/$i/pseudo/$i/Final_Assembly/$i"_FULLCP.fsa" || ll analysis/assembly/$i/pseudo/$i/Final_Assembly/$i"_afin_iter2.fa" || ll analysis/assembly/$i/pseudo/$i/4_Afin_Assembly/filtered_spades_contigs.fsa;done
## spa 批量移动3_Spades_Assembly的gfa
mkdir spa && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/assembly_graph.fastg spa/$i"_assembly_graph.fastg";done
## 批量查看bowtie的reads数
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && showline- && cd -;done
# 批量2_Bowtie_Mapping cut
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && head -n 300000 map_pair_hits.1.fq > cut.1.fq && head -n 300000 map_pair_hits.2.fq > cut.2.fq && cd -;done
# 批量unicycler
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && echo `pwd`; unicycler.py -1 *.1.fq -2 *.2.fq -o uni & cd -;done
# 批量移动unicycler的gfa
mkdir uni && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping/uni/002_overlaps_removed.gfa uni/$i"_002_overlaps_removed.gfa";done
1.批量bowtie
for i in `ls analysis/assembly/`;do echo $i; cd analysis/assembly/$i/pseudo/$i/ && cp_bowtie.align.pl -i $i.fa* -1 1_Trimmed_Reads/*P1.fq -2 1_Trimmed_Reads/*P2.fq -o bowtie & done
2.全部check
for i in $( ls /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly) ;do echo $i; cd /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly/$i/finish && check -1 ../pseudo/$i/4_Afin_Assembly/bowtie/*1.fq -2 ../pseudo/$i/4_Afin_Assembly/bowtie/*2.fq -k 125 -m 3 -g *.fsa -o check;done
3.批量check
for i in {Mm_J3,Mm_L1,Mm_L2,Mm_L3,Mm_S1,Mm_S2,Mm_S3};do echo $i;cd /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly/$i/pseudo/$i/4_Afin_Assembly/ && check -1 bowtie/*1.fq -2 bowtie/*2.fq -k 125 -m 3 -g fil*.fsa -o check ;done
3.2 100次迭代
cd analysis/assembly/*/pseudo/*/
for i in {1..100} ;do echo $((i+1)) && ptGAUL.sh -r result$i/result_3000/ptGAUL_final_assembly/path1.fasta -l pass.fq.gz -t 8 -f 3000 -o result$((i+1)) ;done > log &
1.组装
1)使用unicycler
mkdir -p bow1/uni/ # 创建bow1文件夹,里面有/uni/assembly.fasta
for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/uni/assembly.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && unicycler.py -1 bow$((i+1))/*1.fq -2 bow$((i+1))/*2.fq -o bow$((i+1))/uni --kmers 21,55,127 > log2 ;done & # 指定kmer
for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/uni/assembly.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && cd bow$((i+1)) && uni- > log2 && cd - ;done & # 不指定kmer
2)使用spades
mkdir -p bow1/spades/
for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/spades/scaffolds.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && cd bow$((i+1)) && spa- && cd - ;done > 100tmp.log &
----监控文件夹数量
sh /share/nas1/yuj/script/mitochondrion/assembly/monitor.sh &
2.运行完毕
mkdir uni
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/bow101/uni/002_overlaps_removed.gfa uni/$i"_002_overlaps_removed.gfa";done
3.3 使用延伸程序
序列延伸:
长序列:perl /share/nas6/xul/program/mt2/assembly/src/extend.auto.pl -i1 target.fa(组装序列) -i2 all.fq/fa(原始数据r1、r2各运行一次)
短序列:perl /share/nas6/xul/program/mt2/assembly/src/extend.auto.pl -i1 target.fa -i2 all.fq/fa -sr #优先运行这个
延伸后序列替换:
perl /share/nas6/xul/program/mt2/assembly/src/extend.pl -t target.fa(组装序列) -f _extend.map.fa -c _out.coords -id 999(auto程序展示结果中的匹配reads)
3.4 手动步骤
3.4.1 bowtie
$ cp_bowtie.align.pl -i allseq.fa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
$ cp_bowtie.align.pl -i spa.fsa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
$ cp_bowtie.align.pl -i test.fa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
$ cp_bowtie.align.pl -i filtered_spades_contigs.fsa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
$ showline-
$ head -n 1000 map_pair_hits.1.fq > cut.1.fq && head -n 1000 map_pair_hits.2.fq > cut.2.fq
$ unicycler.py -1 cut.1.fq -2 cut.2.fq -o unicycler
3.4.2 check
$ check -1 bowtie/*1.fq -2 bowtie/*2.fq -k 125 -m 3 -g test.fa -o check &
$ check -1 bowtie/*1.fq -2 bowtie/*2.fq -k 125 -m 3 -g filtered_spades_contigs.fsa -o check &
$ check -1 ../2_Bowtie_Mapping/*1.fq -2 ../2_Bowtie_Mapping/*2.fq -k 125 -m 3 -g test.fa -o check &
$ check -1 ../2_Bowtie_Mapping/*1.fq -2 ../2_Bowtie_Mapping/*2.fq -k 125 -m 3 -g filtered_spades_contigs.fsa -o check &
3.4.3 correct
$ correct -i1 check/K125.scaffold.fa -i2 check/tmp/K125.dump -s 125 -p K125 -o correct
3.4.4 tie
4.tie
# 对原来的test.fa末尾tie
$ tie -i check/K125.scaffold.fa -d check/tmp/K125.dump -k 125 -tp 146000 -p K125 -o tie
$ tie -i test.fa -d check/tmp/K125.dump -k 125 -tp 15400 -p K125 -o tie
# result —— tie/K125.extend.fa
5.ssc
$ cp_ssc.pl -i test.fa -o re_ssc.fa
四.组装软件
4.1unicycler
unicycler.py -1 *1.fq -2 *2.fq -o uni
4.2spades
spades.py --pe1-1 *1.fq --pe1-2 *2.fq --careful -o spades -k 35,45,55,75,97,107
spades.py --pe1-1 *1.fq.gz --pe1-2 *2.fq.gz -t 200 -k 97,107,117,127 -m 600 --careful --phred-offset 33 -o spades
# 全部数据组装 kmer可以设置的长一些
nohup spades.py --pe1-1 *1.fq --pe1-2 *2.fq -o spades -k 55 -m 2000 --careful # 一些不好组装的项目
4.3Getorganelle
# 路径 /share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py
# 其他参数
-R 15 -k 21,45,65,85,105,115 -s 参考
1.常规使用
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt
get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt
# 更快的方法
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt --fast -w 0.68(可不设置)
2.使用已有fastg组装
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -g assembly_graph.fastg -F embplant_pt -o org
4.4 串行运行
unicycler.py -1 *1.fq -2 *2.fq -o uni && get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F animal_mt && spades.py --pe1-1 *1.fq --pe1-2 *2.fq --careful -o spades
五.下载参考(后接注释)
perl /share/nas6/xul/program/chloroplast/bin/cp_get_genbank_form_ncbi_with_ACCESSION.pl
叶绿体 以前的记录
python3 /share/nas1/yuj/cp/get_ass_cfg.py -i ?????/2.clean_data #改物种名
nohup perl /share/nas6/pub/pipline/genome-assembly-seq/mitochondrial-genome-seq/v2.0/assembly/mt.assembly.pip.pl -i ass.cfg &
程序暂停 fg一直不好用 bg
cp_bowtie.align.pl -i test.fa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
----寻找组装用参考
cp 备份一下为test.fa,复制到ncbi找参考
perl /share/nas6/xul/program/chloroplast/bin/cp_get_genbank_form_ncbi_with_ACCESSION.pl 下载参考
一.重新延伸组装
1.先画图看看,大致了解
nuc xxxx.fasta test.fa
mummerplot out.delta
return test.fa
2.生成bowtie文件夹(可选,建议生成)
cp_bowtie.align.pl -i test.fa -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq -o bowtie &
耗时比较久,大约10多分钟
return bowtie/
3.开始检查,生成kmer库, 用bowtie/(优先建议),../1_Trimmed_Reads/(建议),../2_Bowtie_Mapping/,均可
check -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq -k 125 -m 3 -g test.fa -o check & #m为最小深度,选3 5 9 19等
return check/tmp/K125.dump
4.加载kmer库延伸
从ref截一段去kmer库找到kmer0TTTT
cp_extend_with_dump3.pl -i check/tmp/K125.dump -s TTTT可以加入手动选择参数 -c
vim _extend3.fa删除125个
return _extend3.fa
5.画图比对删改调整
nucmer .../fasta/xxxx.fasta _extend3.fa
mummerplot out.delta
修改至成环
return cir_extend.fa
6.调整起点,用ir查看结构
ir cir_extend.fa 采用lsc做开头
return cir_extend.fa
7.检查结果
check -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq -k 125 -m 3 -g cir_extend.fa -o check2 &
check -d check/tmp/K125.dump -k 125 -m 3 -g cir_extend.fa -o check2 &第2步生成了kmer库,也可以直接用它
检查scaffold和覆盖度
return cir_extend.fa
8.结束,创建finish文件夹
xxxx_FULLCP.fsa
二.使用已有结果修补
1.以物种9-Phalaenopsis_stobartiana为例
cp Phalaenopsis_stobartiana_afin_iter2.fa test.fa
nucmer ../../../../Phalaenopsis_stobartiana/pseudo/ref/fasta/MW531729.1.fasta test.fa
mummerplot out.delta
对序列进行操作,切一个环出来
return test.fa
2.生成bowtie文件夹(可选,建议)
cp_bowtie.align.pl -i test.fa -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq -o bowtie &
耗时比较久,大约10多分钟
return bowtie/
3.利用bowtie里的俩文件进行检查,或者利用../2_Bowtie_Mapping/里的来check(注意:../2_Bowtie_Mapping/这里面是18年以前的物种,用的话有可能漏掉reads)
check -1 bowtie/*1.fq -2 bowtie/*2.fq -k 125 -m 3 -g test.fa -o check &
找check里的scaffold,看看有没有N,本例中间末尾均有n
查看k125覆盖情况即kmer深度,若中间有N,则需要查看,末尾有,不看也行
大于70一般没问题,出现突降突升则有问题,寻找0-7的位置,/\t0\.[0-7]
覆盖深度低的地方全部替换为N,可以多换点,得到新的scaffold2
return check/K125.scaffold.fa
4.利用correct程序修补中间N
correct -i1 check/K125.scaffold.fa -i2 check/tmp/K125.dump -s 125 -p K125 -o correct
生成了correct,得到新补的gapfiller,中间没有gap了
找correct里的K125.iteration1.gapfilled.log查看
fastalength K125.gapfiller_finish.fa
return correct/K125.gapfiller_finish.fa
5.用tie延伸(针对末尾有N的情况)
tie -i check/K125.scaffold.fa -d check/tmp/K125.dump -k 125 -tp 146000 -p K125 -o tie
进到tie文件夹,查看下
打开log文件找warn的位置,有可能出现分支,物种9没有分支
查看extend.fa的长度,有可能过长
return tie/extend.fa
6.tie延伸较长的解决办法
方法一:进行序列的反向互补,反向互补后的序列再tie延伸
方法二:
1.打开NCBI,粘贴进去,点击等待结果
2.比对完,下载参考序列放ref文件夹里,顺便挪到pseudo下
画参考图,找到最后面的正确位置 145895
找到145896处,后面全去掉,包括145896的T
3.用tie延伸一下,选择140000开始,本例延伸后依旧过长
得到新的extend与kmer库check一下得到check3文件夹
然后用最新的scaffold,此处为最终结果
return check3/K125.scaffold.fa
7.创建finish文件夹,改名,注意格式
mkdir ../../../finish
cp check3/K125.scaffold.fa ../../../finish/Phalaenopsis_stobartiana_FULLMT.fsa