04叶绿体流程1-组装

一.前期准备

数据位置
/share/nas1/seqloader/yelvti/GP-xxxx_yelvti/2.clean_data

#先看污染比对结果!!!!有问题直接先反馈!!!!
1.污染比对
perl /share/nas6/zhouxy/functional_modules/pollution_nt_blast/pollution_nt_blast_pip_v3.pl -fqdir 路径/2.clean_data -od 输出目录

2.得到ass文件,改成拉丁名,下划线
python3 /share/nas1/yuj/script/chloroplast/get_ass_cfg.py -i 路径(邮件分发) #会自动补上2.clean_data

二.组装

数据库位置

/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/dat/cpdb/plastid.genome.sequence.fasta

nohup perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip.pl -i ass.cfg & # 组装核心流程
nohup perl /share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip3.pl -i ass.cfg & ???没修改好,不要用
当bowtie超过指定大小立马组装

以下脚本为:
当bowtie超过50M或者3_Spades_Assembly已经创建,切断流程,进行组装

#!/bin/bash

while true; do
    if  [ -d "analysis/assembly/"*"/pseudo/"*"/3_Spades_Assembly" ]; then
        echo "文件夹 '3_Spades_Assembly' 已创建"
        break  # 跳出循环
    fi
    if [ -f "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping/map_pair_hits.1.fq" ]; then
        if ((`stat -c %s "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping/map_pair_hits.1.fq"` > 50 * 1024 * 1024)); then
            echo "文件 'map_pair_hits.1.fq' 大小超过50M"
            break  # 跳出循环
        fi
    fi
    sleep 30  # 等待一段时间后再次进行检查
done

echo "条件满足,可以继续执行后续命令"

# 获取之前的进程ID
previous_pid=$(pgrep -f "/share/nas6/pub/pipline/genome-assembly-seq/chloroplast-genome-seq/v1.2/assembly/cp.assembly.pip.pl")

# 终止之前的进程
if [ -n "$previous_pid" ]; then
    kill "$previous_pid"
fi

cd "analysis/assembly/"*"/pseudo/"*"/2_Bowtie_Mapping"
head -n 500000 map_pair_hits.1.fq > cut.1.fq
head -n 500000 map_pair_hits.2.fq > cut.2.fq
unicycler.py -1 cut.1.fq -2 cut.2.fq -o uni

三.组装调整

## 查看起点
for i in ref_ann/fasta/*.fasta*;do nuc  $i  *.fsa  -l 1 && awk 'NR==4 {print $3}' out.delta ;done

3.0 通过节点和深度筛选gfa

cd 1_Trimmed_Reads/spades

blastn -query ref.fa -subject assembly_graph.fastg -outfmt 6 -out blast.log

获得node序号,写入id.txt

perl /share/nas6/xul/program/mt2/assembly/src/filter_node_from_fastg.pl -g assembly_graph.fastg  -i id.txt -d 99(深度)


将pe150 reads连接成更长的reads
/share/nas6/zhouxy/biosoft/pandaseq/current/bin/pandaseq  -f map_pair_hits.1.fq  -r map_pair_hits.2.fq  > merge.fa  2> log

3.1 批量运行

## 批量移动finish
mkdir fsa && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/finish/$i"_FULLCP.fsa" fsa/$i"_FULLCP.fsa";done

## org 批量移动3_Spades_Assembly的gfa
mkdir org && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/extended_K121.assembly_graph.fastg org/$i"_K121.assembly.fastg" || cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/extended_spades/assembly_graph.fastg org/$i"_assembly_graph.fastg";done


## 批量复制org结果
mkdir org_result
for i in `ls analysis/assembly/`;do echo $i;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/embplant_pt.K121.complete.graph1.1.path_sequence.fasta  org_result/$i.fasta;done
cd org_result && for i in *fasta;do filename=$(basename $i .fasta) && sed -i "1s/.*/>$filename/" $i;done && rename .fasta  _FULLCP.fsa  *

## 批量根据参考的ssc方向矫正组装结果  串行
cp_add.py -i ref.fa  -p 107371-125499:+  -sn refssc.fa
for i in org_result/*.fasta;do cp_add.py  -i $i -p  $(ir $i  | grep  SSC | awk -F ":"  '{print $2}'):+   -sn ssc.fa 1>>log1 2>>log2 ; nuc refssc.fa ssc.fa &&   if [ $(awk 'NR==4 {print $3}' out.delta) -gt $(awk 'NR==4 {print $4}' out.delta) ]; then echo $i &&  cp_ssc.pl   -i $i  -o  $i ;fi  ;done

## 并行矫正
cp_add.py -i ref.fa  -p 107371-125499:+  -sn refssc.fa
for i in *.fasta; do (cp_add.py -i $i -p $(perl /share/nas6/xul/program/chloroplast/scope/yelvti_sc.pl -i $i | grep SSC | awk -F ":" '{print $2}'):+ -sn $i.ssc.fa 1>>log1 2>>log2;   nuc ../refssc.fa $i.ssc.fa -p $i  && (if [ $(awk 'NR==4 {print $3}' $i.delta) -gt $(awk 'NR==4 {print $4}' $i.delta) ] ; then     echo $i &&  perl  /share/nas6/xul/program/chloroplast/bin/cp_ssc.pl -i $i -o $i;   fi) )& done

# 批量把调整好的结果放回finish
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;mkdir analysis/assembly/$i/finish -p ;cp org_result/${i}_FULLCP.fsa analysis/assembly/$i/finish;done


## 批量展示流程结果
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;ll  analysis/assembly/$i/pseudo/$i/Final_Assembly/$i"_FULLCP.fsa"  ||  ll analysis/assembly/$i/pseudo/$i/Final_Assembly/$i"_afin_iter2.fa" || ll analysis/assembly/$i/pseudo/$i/4_Afin_Assembly/filtered_spades_contigs.fsa;done

## spa 批量移动3_Spades_Assembly的gfa
mkdir spa && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/3_Spades_Assembly/spades_iter1/assembly_graph.fastg spa/$i"_assembly_graph.fastg";done

## 批量查看bowtie的reads数
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && showline- && cd -;done

# 批量2_Bowtie_Mapping cut
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && head -n 300000 map_pair_hits.1.fq > cut.1.fq && head -n 300000 map_pair_hits.2.fq > cut.2.fq && cd -;done

# 批量unicycler
for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cd analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping && echo `pwd`; unicycler.py -1 *.1.fq -2 *.2.fq -o uni &  cd -;done

# 批量移动unicycler的gfa
mkdir uni && for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/2_Bowtie_Mapping/uni/002_overlaps_removed.gfa uni/$i"_002_overlaps_removed.gfa";done


1.批量bowtie
for i in `ls analysis/assembly/`;do echo $i; cd analysis/assembly/$i/pseudo/$i/ && cp_bowtie.align.pl -i  $i.fa*  -1 1_Trimmed_Reads/*P1.fq -2 1_Trimmed_Reads/*P2.fq  -o bowtie  & done

2.全部check
for i in $( ls /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly) ;do echo $i; cd /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly/$i/finish && check -1 ../pseudo/$i/4_Afin_Assembly/bowtie/*1.fq -2 ../pseudo/$i/4_Afin_Assembly/bowtie/*2.fq  -k 125 -m 3 -g *.fsa -o check;done

3.批量check
for i in {Mm_J3,Mm_L1,Mm_L2,Mm_L3,Mm_S1,Mm_S2,Mm_S3};do echo $i;cd /share/nas1/yuj/project/202201/GP-20211206-3816/analysis/assembly/$i/pseudo/$i/4_Afin_Assembly/ && check -1 bowtie/*1.fq -2 bowtie/*2.fq  -k 125 -m 3 -g fil*.fsa -o check ;done

3.2 100次迭代

cd analysis/assembly/*/pseudo/*/

for i in {1..100} ;do echo $((i+1)) && ptGAUL.sh -r result$i/result_3000/ptGAUL_final_assembly/path1.fasta  -l  pass.fq.gz  -t 8 -f 3000 -o  result$((i+1)) ;done   > log  &

1.组装
1)使用unicycler
mkdir -p bow1/uni/ # 创建bow1文件夹,里面有/uni/assembly.fasta
for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/uni/assembly.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && unicycler.py -1 bow$((i+1))/*1.fq -2 bow$((i+1))/*2.fq -o bow$((i+1))/uni --kmers 21,55,127 > log2 ;done & # 指定kmer

for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/uni/assembly.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && cd bow$((i+1)) && uni-  > log2 && cd - ;done & # 不指定kmer

2)使用spades
mkdir -p bow1/spades/
for i in {1..100} ;do echo $((i+1)) && cp_bowtie.align.pl -i bow$i/spades/scaffolds.fasta -1 1_Trimmed_Reads/*trimmed_P1.fq -2 1_Trimmed_Reads/*trimmed_P2.fq -o bow$((i+1)) && cd bow$((i+1)) && spa- && cd - ;done > 100tmp.log &

----监控文件夹数量
sh /share/nas1/yuj/script/mitochondrion/assembly/monitor.sh &

2.运行完毕
mkdir uni

for i in `ls analysis/assembly/`;do echo $i---------------------------------------------------------------------------------------------------------------------------------------------------------;cp analysis/assembly/$i/pseudo/$i/bow101/uni/002_overlaps_removed.gfa uni/$i"_002_overlaps_removed.gfa";done

3.3 使用延伸程序


序列延伸:
  长序列:perl /share/nas6/xul/program/mt2/assembly/src/extend.auto.pl -i1 target.fa(组装序列) -i2 all.fq/fa(原始数据r1、r2各运行一次) 
  短序列:perl /share/nas6/xul/program/mt2/assembly/src/extend.auto.pl -i1 target.fa -i2 all.fq/fa -sr  #优先运行这个

延伸后序列替换:
  perl /share/nas6/xul/program/mt2/assembly/src/extend.pl  -t target.fa(组装序列) -f _extend.map.fa -c _out.coords   -id  999(auto程序展示结果中的匹配reads)

3.4 手动步骤

3.4.1 bowtie

$ cp_bowtie.align.pl -i allseq.fa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &
$ cp_bowtie.align.pl -i spa.fsa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &
$ cp_bowtie.align.pl -i test.fa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &
$ cp_bowtie.align.pl -i filtered_spades_contigs.fsa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &

$ showline-
$ head -n 1000 map_pair_hits.1.fq > cut.1.fq && head -n 1000 map_pair_hits.2.fq > cut.2.fq
$ unicycler.py -1 cut.1.fq -2 cut.2.fq -o unicycler

3.4.2 check

$ check -1 bowtie/*1.fq -2 bowtie/*2.fq  -k 125 -m 3 -g test.fa -o check &
$ check -1 bowtie/*1.fq -2 bowtie/*2.fq  -k 125 -m 3 -g filtered_spades_contigs.fsa -o check &
$ check -1 ../2_Bowtie_Mapping/*1.fq -2 ../2_Bowtie_Mapping/*2.fq  -k 125 -m 3 -g test.fa -o check &
$ check -1 ../2_Bowtie_Mapping/*1.fq -2 ../2_Bowtie_Mapping/*2.fq  -k 125 -m 3 -g filtered_spades_contigs.fsa -o check &

3.4.3 correct

$ correct -i1 check/K125.scaffold.fa  -i2 check/tmp/K125.dump  -s 125 -p K125 -o correct

3.4.4 tie

4.tie
# 对原来的test.fa末尾tie
$ tie -i check/K125.scaffold.fa -d check/tmp/K125.dump  -k 125  -tp 146000 -p K125 -o tie
$ tie -i test.fa -d check/tmp/K125.dump  -k 125  -tp 15400 -p K125 -o tie
# result —— tie/K125.extend.fa

5.ssc
$ cp_ssc.pl -i test.fa  -o re_ssc.fa

四.组装软件

4.1unicycler

unicycler.py -1 *1.fq -2 *2.fq -o uni

4.2spades

spades.py --pe1-1 *1.fq --pe1-2 *2.fq --careful  -o spades               -k 35,45,55,75,97,107

spades.py --pe1-1 *1.fq.gz --pe1-2 *2.fq.gz -t 200 -k 97,107,117,127 -m 600 --careful --phred-offset 33 -o spades
# 全部数据组装  kmer可以设置的长一些

 nohup spades.py --pe1-1 *1.fq --pe1-2 *2.fq -o spades -k 55 -m 2000 --careful # 一些不好组装的项目

4.3Getorganelle

# 路径 /share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py

# 其他参数  
-R 15 -k 21,45,65,85,105,115 -s 参考
1.常规使用
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt
get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt

# 更快的方法	
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F embplant_pt --fast                   -w 0.68(可不设置)

2.使用已有fastg组装
/share/nas1/yuj/software/miniconda3/envs/getorganelle/bin/get_organelle_from_reads.py -g assembly_graph.fastg -F embplant_pt -o org

4.4 串行运行

unicycler.py -1 *1.fq -2 *2.fq -o uni && get_organelle_from_reads.py -1 *1.fq -2 *2.fq -o org -F animal_mt && spades.py --pe1-1 *1.fq --pe1-2 *2.fq --careful  -o spades

五.下载参考(后接注释)

perl /share/nas6/xul/program/chloroplast/bin/cp_get_genbank_form_ncbi_with_ACCESSION.pl

叶绿体 以前的记录

python3 /share/nas1/yuj/cp/get_ass_cfg.py -i ?????/2.clean_data #改物种名
nohup perl /share/nas6/pub/pipline/genome-assembly-seq/mitochondrial-genome-seq/v2.0/assembly/mt.assembly.pip.pl -i ass.cfg &
程序暂停  fg一直不好用  bg
cp_bowtie.align.pl -i test.fa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &


----寻找组装用参考
cp   备份一下为test.fa,复制到ncbi找参考
perl /share/nas6/xul/program/chloroplast/bin/cp_get_genbank_form_ncbi_with_ACCESSION.pl 下载参考


一.重新延伸组装
1.先画图看看,大致了解
nuc xxxx.fasta  test.fa
mummerplot out.delta
return test.fa

2.生成bowtie文件夹(可选,建议生成)
cp_bowtie.align.pl -i test.fa  -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq  -o bowtie &
耗时比较久,大约10多分钟
return bowtie/

3.开始检查,生成kmer库, 用bowtie/(优先建议),../1_Trimmed_Reads/(建议),../2_Bowtie_Mapping/,均可
check -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq  -k 125 -m 3 -g test.fa -o check & #m为最小深度,选3 5 9 19等
return check/tmp/K125.dump

4.加载kmer库延伸
从ref截一段去kmer库找到kmer0TTTT
cp_extend_with_dump3.pl -i check/tmp/K125.dump  -s TTTT可以加入手动选择参数 -c
vim _extend3.fa删除125个
return _extend3.fa

5.画图比对删改调整
nucmer .../fasta/xxxx.fasta  _extend3.fa
mummerplot out.delta 
修改至成环
return cir_extend.fa

6.调整起点,用ir查看结构
ir cir_extend.fa 采用lsc做开头
return cir_extend.fa

7.检查结果
check -1 ../1_Trimmed_Reads/xxxx.trimmed_P1.fq -2 ../1_Trimmed_Reads/xxxx.trimmed_P2.fq  -k 125 -m 3 -g cir_extend.fa  -o check2 &
check -d check/tmp/K125.dump  -k 125 -m 3 -g cir_extend.fa -o check2 &第2步生成了kmer库,也可以直接用它
检查scaffold和覆盖度
return cir_extend.fa

8.结束,创建finish文件夹
xxxx_FULLCP.fsa 




二.使用已有结果修补
​1.以物种9-Phalaenopsis_stobartiana为例
​cp Phalaenopsis_stobartiana_afin_iter2.fa test.fa
​nucmer ../../../../Phalaenopsis_stobartiana/pseudo/ref/fasta/MW531729.1.fasta  test.fa
​mummerplot out.delta 
​对序列进行操作,切一个环出来
​return test.fa
​
​2.生成bowtie文件夹(可选,建议)
​cp_bowtie.align.pl -i test.fa  -1 ../1_Trimmed_Reads/*P1.fq -2 ../1_Trimmed_Reads/*P2.fq  -o bowtie &
​耗时比较久,大约10多分钟
​return bowtie/
​
​3.利用bowtie里的俩文件进行检查,或者利用../2_Bowtie_Mapping/里的来check(注意:../2_Bowtie_Mapping/这里面是18年以前的物种,用的话有可能漏掉reads)
​check -1 bowtie/*1.fq -2 bowtie/*2.fq  -k 125 -m 3 -g test.fa -o check &
​找check里的scaffold,看看有没有N,本例中间末尾均有n
​查看k125覆盖情况即kmer深度,若中间有N,则需要查看,末尾有,不看也行
​大于70一般没问题,出现突降突升则有问题,寻找0-7的位置,/\t0\.[0-7]
​覆盖深度低的地方全部替换为N,可以多换点,得到新的scaffold2
​return check/K125.scaffold.fa
​
​4.利用correct程序修补中间N
​correct -i1 check/K125.scaffold.fa  -i2 check/tmp/K125.dump  -s 125 -p K125 -o correct
​生成了correct,得到新补的gapfiller,中间没有gap了
​找correct里的K125.iteration1.gapfilled.log查看
​fastalength K125.gapfiller_finish.fa
​return correct/K125.gapfiller_finish.fa
​
​5.用tie延伸(针对末尾有N的情况)
​tie -i check/K125.scaffold.fa -d check/tmp/K125.dump  -k 125  -tp 146000 -p K125 -o tie
​进到tie文件夹,查看下
​打开log文件找warn的位置,有可能出现分支,物种9没有分支
​查看extend.fa的长度,有可能过长
​return tie/extend.fa
​
​6.tie延伸较长的解决办法
​方法一:进行序列的反向互补,反向互补后的序列再tie延伸
​方法二:
​1.打开NCBI,粘贴进去,点击等待结果
​2.比对完,下载参考序列放ref文件夹里,顺便挪到pseudo下
​画参考图,找到最后面的正确位置  145895
​找到145896处,后面全去掉,包括145896的T
​3.用tie延伸一下,选择140000开始,本例延伸后依旧过长
​得到新的extend与kmer库check一下得到check3文件夹
​然后用最新的scaffold,此处为最终结果
​return check3/K125.scaffold.fa
​
​7.创建finish文件夹,改名,注意格式
​mkdir ../../../finish
​cp check3/K125.scaffold.fa  ../../../finish/Phalaenopsis_stobartiana_FULLMT.fsa