05线粒体流程4-真菌版

1️.MITOS注释

## for循环步骤
for i in $(ls analysis/annotation);do echo $i;cd analysis/annotation/$i &&  mkdir mitos2 && cd -;done
for i in $(ls analysis/annotation);do echo $i;cd analysis/annotation/$i && micromamba run -n mitos2 runmitos.py -i $i"_FULLMT.fsa"  -o mitos2/ -c 4 -R /share/nas1/yuj/software/mitos2_refdata/0.3_ref -r refseq63f --maxtrnaovl 100 --maxrrnaovl 100 --rrna 2 & done # 注意,这里是refseq63f

## 步骤
cd analysis/annotation/xxx  
mkdir mitos2
micromamba run -n mitos2 runmitos.py -i *.fsa -o mitos2/ -c 4 -R /share/nas1/yuj/software/mitos2_refdata/0.3_ref -r refseq63f --maxtrnaovl 100 --maxrrnaovl 100 --best --rrna 2

二.MITOS注释修改

2.1 解析注释结果

## for循环步骤
1)转换结果
for i in $(ls analysis/annotation);do echo $i;cd analysis/annotation/$i && python3 /share/nas1/yuj/script/mitochondrion/annotation/mt_mitos2info_table.py -i mitos2/result.mitos -fa $i"_FULLMT.fsa"  -o gene.info &&  python3 /share/nas1/yuj/script/fungi/annotation/fungi_parse_info_table.py -i gene.info -o gene.annotation.info2 -n 4 &&  cd - & done

2)看一下基因数量
for i in $(ls analysis/annotation);do echo $i;grep "Total" analysis/annotation/$i/gene.annotation.info2;done

2.2 修改注释

2.2.1 共线性好,用一个ref(推荐)

1.总览
count=0 && for i in $(grep 'CDS' gene.annotation.info2 | awk '{print $3 "_:_" $2}'); do count=$((count+1)) &&  echo "" && echo "###############" $count ":"  $(echo "$i" | awk -F "_:_" '{print $1}') "###############" && fu_add.py  -i *.fsa -p $(echo "$i" | awk -F "_:_" '{print $2}') -f ;done  > sample_ann.log

count=0 && for i in $(grep 'CDS' *.gbk.ann | awk '{print $3 "_:_" $2}'); do count=$((count+1)) &&  echo "" && echo "###############" $count ":"  $(echo "$i" | awk -F "_:_" '{print $1}') "###############" && fu_add.py  -i *.fasta  -p $(echo "$i" | awk -F "_:_" '{print $2}') -f ;done  > ref_ann.log

2.提取cds序列,挑选一个作为参考
cd ref_ann
python3 /share/nas1/yuj/script/fungi/annotation/from_gbk_get_cds.py -i gbk/ -o out  -d
ir.py -s7 -n 4 -i out/cds/cds_OP453745.1.fasta -o cds_OP453745.1.pep && cp out/cds/cds_OP453745.1.fasta ./

tblastn -query cds_OP453745.1.pep -subject  ../*.fsa -outfmt 6 | cut -f 1,7,8,9,10 > cds.pep.log
blastn -query cds_OP453745.1.fasta -subject  ../*.fsa -outfmt 6 | cut -f 1,7,8,9,10 > cds.fa.log
cd ../

3.进一步处理这些比对出的位置
## for循环直接全部处理了
cd ref_ann && for i in cds*log; do for j in {ATP6,ATP8,ATP9,COX1,COX2,COX3,CYTB,ND1,ND2,ND3,ND4,ND4L,ND5,ND6,RPS3}; do  echo ""; echo $j"-----------------------------------------"; grep -i  $j $i ; done > ../${i%.log}.txt ; done && cd ..
1	1481	6276	7756
1	1481	6276	7756
从结果里挑选最符合的

2.2.2 共线性有点差,把所有参考都用上(???)

先分组基因再blast
cd ref_ann
python3 /share/nas1/yuj/script/fungi/annotation/from_gbk_get_cds.py -i gbk/ -o out  -d
for i in out/cds/*;do grep -A 1 -i 'nd5'  $i >> nd5_all.fa;done

wc -l nd5_all.fa 
104 nd5_all.fa

for i in {1..104};do if [ $(($i%2)) -eq 0 ];then sed -n ""$i"p" nd5_all.fa > $i"_nd5";blastn -query $i"_nd5" -subject *.fsa -outfmt 6;fi;done # 挨个取出nd5,挨个blast
blastn -query cox1_all.fa -subject *.fsa -outfmt 6 > blastn_cox1 # 所有cox1一起blast
直接blast再解析(???)
mkdir ann_tmp
1.提取出cds及对应蛋白序列
python3 /share/nas1/yuj/script/fungi/annotation/from_gbk_get_cds.py -i ref_ann/gbk/ -o ref_ann/out  -d

???
cp ref_ann/out/cds/*.fasta ann_tmp/ && python3 /share/nas1/yuj/script/mitochondrion/phytree/mt_extract2mafft_V1.5.py -i ann_tmp/ -o1 gene/extract -o2 gene/mafft -c2
停顿几秒等比对完
cp *.fsa ann_tmp && cd ann_tmp
for i in cds*.fasta;do ir.py -s7 -n 5 -i $i -o ${i%.fasta}.pep;done # 无脊椎
for i in cds*.fasta;do ir.py -s7 -n 2 -i $i -o ${i%.fasta}.pep;done # 脊椎

2.利用核酸和蛋白序列blast比对
for i in cds*.fasta;do blastn -query $i  -subject *.fsa -outfmt 6 -max_hsps 1 > $i.log;done
for i in cds*.fasta;do tblastn -query ${i%.fasta}.pep -subject *.fsa -outfmt 6 -max_hsps 1 > ${i%.fasta}.pep.log;done
for i in cds_*.log;do echo $i;cut -f 1,7,8,9,10 $i > ${i%.log}.txt;done

3.进一步处理这些比对出的位置
## for循环直接全部处理了
for j in {COX1,COX2,CYTB,ATP8,ND4,ATP6,ND3,ND1,ND5,ND6,ND4L,ND2,COX3}; do echo $j;for i in cds*txt; do grep -i  $j $i ; done | cut -f 2,3,4,5 | sort -k 3 ;done > ../allcds.blast.txt && cd ..
1	1481	6276	7756
1	1481	6276	7756
从结果里挑选最符合的

## 比如这个是查找nd4基因
for i in cds*txt;do grep -i -v nd4l $i| grep -i nd4;done | cut -f 2,3,4,5 | sort -k 3

三.真菌分析

python3 /share/nas1/yuj/script/fungi/phytree/fungi_from_gbk_get_cds_V1.0.py -i gbk/  -o out  -d # -d表示不去重复
生成的cds文件格式如下:
>Ganoderma_sp._MN356101 [35773..35552] [gene=ATP9]