Commit 6f856d32 authored by Rebecca E Batorsky's avatar Rebecca E Batorsky
Browse files

modified pipeline script, test data and README

parent dbcbea48
......@@ -2,24 +2,25 @@
This is a repository to work on the error prone dna repair project with the McVey lab at Tufts University.
The script will be run like this on the command line, with a single argument that is the hifibr output file:
** Dependencies ***
Rscript process_hifibr.R TestData_HiFiBR_Output_mod.csv
Python 2.7
Python libraries: pandas
You need to have the tidyverse and Biostrings library installed.
R >=3.5
R libraries: tidyverse, Biostrings
It generates four output files in a directory like this:
** Pipeline Script **
This script takes one input: the path to the HiFibr output file
Directory: TestData_HiFiBR_Output_mod_output
Example usage on test data:
Files:
TestData_HiFiBR_Output_mod_reclassified.csv: same as input, but adds an “ID” column as well as a column for how the sequence was reclassified
TestData_HiFiBR_Output_mod_complex.txt: sequences that could not be reclassified as ins or del
TestData_HiFiBR_Output_mod_insertion.txt: all ins sequences
TestData_HiFiBR_Output_mod_deletion.txt: all del sequences with dashes
`sh run_pipeline.sh test_data/polyA1Seq/PolyA1Seq_testdata.csv`
I added a few more sequences to the test data, the last three lines of the file TestData_HiFiBR_Output_mod.csv, which are:
1. Reference sequence
2. One sequence with <10 reads
3. One sequence with both SNP and deletion, which ends in the complex category because it can’t be reclassified
It generates four output files in a directory `PolyA1Seq_testdata_output`
Files:
`PolyA1Seq_testdata_reclassified.csv` Same format input, but adds an “ID” column as well as a column for how the sequence was reclassified
`PolyA1Seq_testdata_complex.txt` sequences that could not be reclassified as ins or del
`PolyA1Seq_testdata_insertion.txt` all ins sequences
`PolyA1Seq_testdata_deletion.txt` all del sequences with dashes
\ No newline at end of file
No preview for this file type
......@@ -28,21 +28,6 @@ if (length(args)<5) {
}
#out_dir = "~/Documents/git/sdmmej/test_data/TestData_HiFiBR_Output_mod_output/"
#hifi_in = paste0(out_dir,"TestData_HiFiBR_Output_mod_reclassified.csv")
#insertion_in = paste0(out_dir,"TestData_HiFiBR_Output_mod_insertion.txt")
#nick=71
# out_dir = "~/Documents/git/sdmmej/test_data/expected_results/"
# hifi_in = "~/Documents/git/sdmmej/test_data/TestData_HiFiBR_Output_mod_output/TestData_HiFiBR_Output_mod_reclassified.csv"
# insertion_in = paste0(out_dir,"TestData_Insertions.csv")
# nick=71
# out_dir = "~/Box/rebecca_documents/sdmmej_data/"
# hifi_in = "~/Box/rebecca_documents/sdmmej_data/PolyA1Seq_reclassified.csv"
# insertion_in = "~/Box/rebecca_documents/sdmmej_data/PolyA1Seq_insertion.txt"
# nick=161
#
a<-read.csv(insertion_in)
## get reference
......@@ -65,7 +50,6 @@ R <- substr(ref,nick+1,nchar(ref))
if (substr(R,0,4) == 'TTAT'){
L = paste0(L,'TAT')
}
L
l <- nchar(L) # number of nucleotides of the left hand sequence
r <- nchar(R) # number of nucleotides of the right hand sequence
k1 <- 30 # how far you want to cut back to search, this needs to to be adjusted based on how large the deletions are, but i think this covers up to 30 bp of deletion on either side.
......@@ -81,13 +65,6 @@ for (i in a[, 1]){
a2[i] = lbb
}
### testing
#sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = FALSE)
#seq=i
#align <- pairwiseAlignment(seq, ref, substitutionMatrix = sigma, gapOpening = -2,
# gapExtension = -8, scoreOnly = FALSE)
###
a3=NULL # create empty vector to insert RIGHT del boundary
for (i in a[, 1]){
......@@ -426,86 +403,3 @@ output3 <- paste0(out_dir, "/", plasmid, "testdata_insertion_alignment2.csv")
write.csv(master2, output1)
write.csv(test2, output2)
write.csv(pretty, output3)
### THE REST is not used
#
#
#
#
#
#
# #----------------------------------------
# # Went through insertions and had to manually change some indices.
# # Reload the consistency table and rerun consistency determination and alignment completion
# # NOTE: after going through the insertion consistency table, either delete the first collumn of just row numbers
# # and save, OR make sure to remove that first collumn. The first collumn of the "master2" data frame should be "ID"
#
# master <- read.csv("test_data_m6testdata_insertion_consistency2_curated.csv")
# master <- as.data.frame(master[,1:6])
#
# # if you dont have a5 already loaded...chances are you dont, rerun lines 13-52 to generate that data frame.
# # It is just easier that trying to dick around with formatting the "master" file.
#
# master2 <- merge(master, a5, by="ID", all.y=TRUE)
# master2$DRmotif_length <- master2$DR_END-master2$DR_START+1 # add repeat motif length for direct repeat
# master2$RCmotif_length <- master2$RC_END-master2$RC_START+1 # add repeat motif length for reverse complement
# master2[,2] <- ifelse(master2$DRmotif_length>=4,master2[,2] , NA) # consistency for either direct repeat/reverse complement
# master2[,4] <- ifelse(master2$RCmotif_length>=4,master2[,4] , NA)
# master2[,6] <- ifelse(!is.na(master2[,2]), "TRUE", ifelse(!is.na(master2[,4]), "TRUE", "FALSE" ))
# # Code below workds to give you "aligned" templates
# for (i in 1:nrow(master2)){
# if(!is.na(master2[i,2])){
# test <- substring(master2[i,10], first = master2[i,2], last = master2[i,3])
# test2 <- ifelse((as.numeric(master2[i,2])>as.numeric(master2[i,8])), (as.numeric(master2[i,2])+as.numeric(nchar(as.character(master2[i,11]))-1)), (as.numeric(master2[i,2])-1))
# test3 <- rep.int("-", times=test2)
# test4 <- paste(test3, collapse="")
# test5 <- paste(test4, test, sep="")
# test6 <- rep.int("-", times=(as.numeric(nchar(as.character(master2[i,7]))-nchar(test5))))
# test7 <- paste(test6, collapse="")
# test8 <- paste(test5, test7, sep="")
# master2[i,15]=test8
# }
# else {
# master2[i,15]="0"
# }
# }
# # For reverse complement
# for (i in 1:nrow(master2)){
# if(!is.na(master2[i,4])){
# test <- tolower(substring(master2[i,10], first = master2[i,4], last = master2[i,5])) # make reverse complement repeats lowercase
# test2 <- ifelse((as.numeric(master2[i,4])>as.numeric(master2[i,8])), (as.numeric(master2[i,4])+as.numeric(nchar(as.character(master2[i,11]))-1)), (as.numeric(master2[i,4])-1))
# test3 <- rep.int("-", times=test2)
# test4 <- paste(test3, collapse="")
# test5 <- paste(test4, test, sep="")
# test6 <- rep.int("-", times=(as.numeric(nchar(as.character(master2[i,7]))-nchar(test5))))
# test7 <- paste(test6, collapse="")
# test8 <- paste(test5, test7, sep="")
# master2[i,16]=test8
# }
# else {
# master2[i,16]="0"
# }
# }
# colnames(master2)[15:16] <- c("Loop-out", "Snap-back")
# test <- reshape(master2,
# varying=c("RECONSTRUCTED_SEQ", "Loop-out", "Snap-back"),
# idvar="ID",
# v.names="insertion_alignment",
# timevar="mechanism",
# times=c("seq", "Loop-out", "Snap-back"),
# new.row.names=1:10000,
# direction="long")
# test <- test[order(test$ID),]
# test$unicorn <- paste(test[,1], test[,14], test[,15], sep="-")
# test2 <- test[!duplicated(test[,16]),]
# # save it, before making a slightly cleaner version
# # Make pretty, two columns, ID, and aligned seq
# pretty <- cbind(ID=test2[,1], as.data.frame(test2[,15]), as.data.frame(test2[,14]))
# colnames(pretty)[2:3] <- c("insertion_alignment", "mechanism")
#
# output1 <- paste(plasmid, "testdata_insertion_consistency3.csv", sep="")
# output2 <- paste(plasmid, "testdata_insertion_consistency_long3.csv", sep="")
# output3 <- paste(plasmid, "testdata_insertion_alignment3.csv", sep="")
# write.csv(master2, output1)
# write.csv(test2, output2)
# write.csv(pretty, output3)
This diff is collapsed.
......@@ -5,15 +5,22 @@ suppressPackageStartupMessages(library(tidyverse))
suppressPackageStartupMessages(library(Biostrings))
# test if there is at least 5 argument: if not, return an error
if (length(args)<5) {
stop("Usage: Rscript process_hifibr.R input_file.csv outdir search_radius debug", call.=FALSE)
} else if (length(args)==5) {
input_file = args[1]
out_dir = args[2]
search_radius=as.integer(args[3])
break_location=as.integer(args[4])
debug=as.integer(args[5])
}
if (length(args)<5) {
stop("Usage: Rscript process_hifibr.R input_file.csv outdir search_radius debug", call.=FALSE)
} else if (length(args)==5) {
input_file = args[1]
out_dir = args[2]
search_radius=as.integer(args[3])
break_location=as.integer(args[4])
debug=as.integer(args[5])
}
##
#input_file="~/Box/rebecca_documents/sdmmej_data/rerun_inconsistent_14Oct20/PolyA1Seq_reclassified_inconsistent.csv"
#out_dir="~/Box/rebecca_documents/sdmmej_data/rerun_inconsistent_14Oct20/"
#search_radius=30
#break_location=161
#debug=1
print(paste0("Input file: " ,input_file))
print(paste0("Output dir: ", out_dir))
......@@ -21,18 +28,6 @@ print(paste0("Search radius: ", search_radius))
print(paste0("Break location: ", break_location))
print(paste0("Debug: ", debug))
## functions
##
# input_file="~/Documents/git/sdmmej/test_data_2/PolyA1Seq_questions.csv"
# out_dir="~/Documents/git/sdmmej/test_data_2/"
# search_radius=30
# break_location=161
# debug=1
#input_file="~/Documents/git/sdmmej/test_data/TestData_HiFiBR_Output_mod.csv"
hifibr_input = read.csv(input_file)
## add an id col and col for reclassified CLASS
......@@ -55,9 +50,9 @@ if ( n_ref != 1){
}
# separate sequences and filter for length
hifibr_input_filter_del = hifibr_input %>% dplyr::filter(.,READS > 10 & CLASS == 'deletion')
hifibr_input_filter_ins = hifibr_input %>% dplyr::filter(.,READS > 10 & CLASS == 'insertion')
hifibr_input_filter_complex = hifibr_input %>% dplyr::filter(.,READS > 10 & CLASS == 'complex')
hifibr_input_filter_del = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'deletion')
hifibr_input_filter_ins = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'insertion')
hifibr_input_filter_complex = hifibr_input %>% dplyr::filter(.,READS >= 10 & CLASS == 'complex')
# process deletions
deletions = hifibr_input_filter_del$ALIGNED_SEQ
......@@ -72,112 +67,41 @@ for (seq in deletions){
aligned_deletions = c(aligned_deletions,align_string)
}
#print("we have these aligned deletions")
#print(aligned_deletions)
if(debug == 1){
print("we have these aligned deletions")
print(aligned_deletions)
}
# process insertions
insertions = hifibr_input_filter_ins$ALIGNED_SEQ
#print("we have these insertions")
#print(insertions)
if(debug == 1){
print("we have these insertions")
print(insertions)
}
# process complex
comp_unknown = c()
# ### debug
# seq = hifibr_input_filter_complex[2, 'ALIGNED_SEQ']
# id = hifibr_input_filter_complex[2,'ID']
# print(seq)
# print(id)
# sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = FALSE)
# align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
# gapExtension = -8, scoreOnly = FALSE)
#
# ins_info = insertion(align)[[1]]
# del_info = deletion(align)[[1]]
# mis_info = mismatchTable(align)
#
# ins_n = length(ins_info)
# ins_pos = start(ins_info)
# ins_r = abs(break_location - ins_pos)
#
# del_n = length(del_info)
# del_pos = start(del_info)
# del_r = abs(break_location - del_pos)
#
# mis_n = nmismatch(align)
# mis_pos = mis_info$SubjectStart
# mis_r = abs(break_location - mis_pos)
#
#
# print(c(ins_n,ins_pos, ins_r))
# print(c(del_n,del_pos, del_r))
# print(c(mis_n,mis_pos, mis_r))
#
# ## it seems they are always IRanges objects of length 1
# label = ""
#
# ## check number of deletions
# deletion=deletion(align)[[1]]
# n_del=length(deletion)
# print(n_del)
# # check
# if (n_del >1){
# label = "complex"
# }
#
# # check position of deletions
# pos_del=start(deletion)[1]
# r_del=abs(break_location-pos_del)
# print(r_del)
# if (r_del > search_radius){
# label = "complex"
# }
#
# # check number of insertions
# insertion=insertion(align)[[1]]
# n_ins=length(insertion)
# print(n_ins)
# if (n_ins >1){
# label="complex"
# }
#
# # check position of insertions
# pos_ins=start(insertion)[1]
# print(pos_ins)
#
# r_ins=abs(break_location-pos_ins)
# print(r_ins)
# # check
# if (r_ins > search_radius){
# label="complex"
# }
#
# # check position of mismatch, assuming it is only one
# pos_mis=mismatchTable(align)$SubjectStart
# r_mis=abs(break_location-pos_mis)
#
# if (r_mis > search_radius){
# label="complex"
# }
#
#
# print(label)
###
for (row in 1:nrow(hifibr_input_filter_complex)){
seq = hifibr_input_filter_complex[row, 'ALIGNED_SEQ']
id = hifibr_input_filter_complex[row,'ID']
if(debug == 1){
print(id)
print(paste0("id is: ",id))
}
sigma <- nucleotideSubstitutionMatrix(match = 2, mismatch = -1, baseOnly = FALSE)
align <- pairwiseAlignment(seq, ref_seq, substitutionMatrix = sigma, gapOpening = -2,
gapExtension = -8, scoreOnly = FALSE)
if(debug == 1){
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
print("insertion")
print(insertion(align))
print("deletion")
print(deletion(align))
print("mismatch")
print(mismatchTable(align))
}
......@@ -188,11 +112,20 @@ for (row in 1:nrow(hifibr_input_filter_complex)){
ins_n = length(ins_info)
ins_pos = start(ins_info)
ins_r = abs(break_location - ins_pos)
del_n = length(del_info)
del_pos = start(del_info)
del_r = abs(break_location - del_pos)
if(debug==1){
print("deln")
print(del_n)
print("del_pos")
print(del_pos)
print("del_r")
print(del_r)
}
mis_n = nmismatch(align)
mis_pos = mis_info$SubjectStart
mis_r = abs(break_location - mis_pos)
......@@ -206,10 +139,10 @@ for (row in 1:nrow(hifibr_input_filter_complex)){
if(debug == 1){
print("found del")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
}else{
if(debug == 1){
print(paste0("found del outside search radius at ", del_r))
}
}
}else if (mis_n == 0 & del_n ==0 & ins_n == 1 ){
......@@ -219,18 +152,10 @@ for (row in 1:nrow(hifibr_input_filter_complex)){
if(debug == 1){
print("found ins")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
}else{
if(debug == 1){
print("found complex (del outside search radius)")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
comp_unknown = c(comp_unknown, seq)
hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
......@@ -242,18 +167,10 @@ for (row in 1:nrow(hifibr_input_filter_complex)){
if(debug == 1){
print("found single mismatch ins")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
}else{
if(debug == 1){
print("found complex (ins outside search radius)")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
comp_unknown = c(comp_unknown, seq)
hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
......@@ -261,10 +178,6 @@ for (row in 1:nrow(hifibr_input_filter_complex)){
}else{
if(debug == 1){
print("found complex")
print("ref")
print(toString(align@subject))
print("seq")
print(toString(align@pattern))
}
comp_unknown = c(comp_unknown, seq)
hifibr_input[hifibr_input$ID == id,]$CLASS_final = 'complex'
......
#!/bin/bash
## takes one argument which is the hifi output
if [ "$#" -ne 1 ]; then
echo "Error: please supply the path to the HiFibr input file as a command line argument"
fi
input=$1
nick=$2
DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
## These parameters are the defaults, can be changed in this script if needed
search_radius=30
breakpoint=161
debug=0
##
bn=$( basename ${input%.csv} )
results_dir=${DIR}/$( dirname $input )/${bn}_output
results_dir=$( pwd )/$( dirname $input )/${bn}_output
mkdir -p ${results_dir}
echo ${results_dir}
echo "Results will be located in ${results_dir}"
hifi_reclass=${results_dir}/${bn}_reclassified.csv
deletion_out=${results_dir}/${bn}_deletion.txt
insertion_out=${results_dir}/${bn}_insertion.txt
source /anaconda3/etc/profile.d/conda.sh
conda deactivate
conda activate sdmmej
cd ~/Documents/git/sdmmej
echo "------"
echo "Starting Hifibr processing"
#Rscript process_hifibr.R $input $results_dir
echo "Done Hifiber processin"
echo "Starting to process HiFibr output file"
echo "------"
Rscript process_hifibr.R $input $results_dir $search_radius $breakpoint $debug
echo "------"
echo "Done Hifiber processing"
echo "------"
echo "Starting deletion consistency script"
#python ${DIR}/deletion/SDMMEJDeletionProgram_cli.py -hi ${hifi_reclass} -del ${deletion_out} -n ${nick} -out $results_dir
cd deletion/
python SDMMEJDeletionProgram_cli.py -hi ${hifi_reclass} -del ${deletion_out} -n $breakpoint -out $results_dir
echo "------"
echo "Done deletion script"
echo "------"
echo "------"
echo "Starting insertion consistency script"
Rscript insertion/INSERTION_PROGRAM.R ${hifi_reclass} ${insertion_out} ${nick} ${results_dir}
echo "Done insertion script"
echo "------"
cd ../insertion/
Rscript INSERTION_PROGRAM.R ${hifi_reclass} ${insertion_out} $results_dir $breakpoint $search_radius
echo "------"
echo "Done insertion script"
echo "------"
No preview for this file type
original reference has flanking sequence:
before = ACACTCTTTCCCTACACGACGCTCTTCCGATCTGAG
after = AGGCAGATCGGAAGAGCACACGTCTGAACTCCAGTC
rest of reference:
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
before break:
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTAT
after break:
CCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
#gap deletions
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTAT----------GGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTAT------------------------------CTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACAT-----------CCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCT--TATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG (is labeled complex line 3 in test)
For Python progr
#nogap deletions
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG (is labeled complex line 3 in test)
#insertions
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTATTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG ( is labeled complex line 9 in test)
# understand complex sequences
# insertion
70M4D4I97M
G>A
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG #ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTATTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG #complex
#deletion
70M4D2I97M
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCT--TATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
#!/bin/bash
muscle -in complex_i.fa -out complex_i.out.fa
cat complex_i.out.fa | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' > complex_i.out.single.fa
muscle -in complex_d.fa -out complex_d.out.fa
cat complex_d.out.fa | awk '/^>/ {printf("\n%s\n",$0);next; } { printf("%s",$0);} END {printf("\n");}' > complex_d.out.single.fa
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>complex_d
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGT
ACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGG
CCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>complex_d
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGT
ACATTACCC--TTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGG
CCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>complex_d
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCC--TTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>complex_i
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTATTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGT
ACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGG
CCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>complex_i
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGT
ACATTACCCTATTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGG
CCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG
>ref
GGAAAAAATTCGTACTTTGGAGTACGAAATGCGTCGTTTAGAGCAGCAGCCGAATTCGGTACATTACCCTGTTATCCCTAGCTATGGTCTGCGCTACTAGTGGATCTGGGGCCGCATAGGCCATCCTCTAGAGTCGACCTCGAACGTAAACGTTAACGTAACGTTAACTCG