Commit e4cb6166 authored by Rebecca E Batorsky's avatar Rebecca E Batorsky
Browse files

updated files

parent 7839f651
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="$PROJECT_DIR$/.." vcs="Git" />
</component>
</project>
\ No newline at end of file
from EJR_junction_auxfns import *
from EJR_junction_pool import *
from EJR_junction_class import *
import argparse
import pandas as pd
import os
def parse_args():
parser = argparse.ArgumentParser()
parser.add_argument("-hi", dest="h_in", help="hifiber results, processed by previous step", required=True)
parser.add_argument("-del", dest="d_in", help="deletion file", required=True)
parser.add_argument("-out", dest="dir_path", help="directory_for_outputs", required=True)
parser.add_argument("-n", dest="nick", help="location of the nick (the 0-based reference base after which nick occurs", required=True)
parser.add_argument("-r", dest="search_radius", help="number of bases to the left and right of the repair junction to search for SD-MMEJ consistent repeats", required=False, default=30)
parser.add_argument("-m", dest="min_consist_rep_size", help="minimum size of an SD-MMEJ consistent repeat", required=False, default=4)
args = parser.parse_args()
return args.h_in, args.d_in, args.dir_path, int(args.nick), int(args.search_radius), int(args.min_consist_rep_size)
def parse_ref(h_in, nick):
hifi_input = pd.read_csv(h_in)
reference = hifi_input.loc[hifi_input['CLASS'] == 'exact']
if (len(reference) != 1):
print(len(reference))
print("ERROR wrong number of reference sequences: ", len(reference))
original_seq = reference['ALIGNED_SEQ'].values[0]
left_dna = original_seq[0:nick]
right_dna = original_seq[nick:]
mid_dna = ''
check_mid = right_dna[0:4]
if (check_mid == "TTAT"):
right_dna = right_dna[4:]
mid_dna = "TTAT"
left_dna = left_dna.upper()
mid_dna = mid_dna.upper()
right_dna = right_dna.upper()
rightnick =(len(left_dna) + len(mid_dna))
return original_seq, left_dna, mid_dna, right_dna, rightnick
h_in, d_in, dir_path, nick, search_radius, min_consist_rep_size = parse_args()
original_seq, left_dna, mid_dna, right_dna, rightnick = parse_ref(h_in, nick)
# get and process data file
data_file = d_in
data_file = data_file.strip()
df = open(data_file)
base_name = os.path.basename(data_file).strip('.txt')
# get output filename
outfile = dir_path + "/" + base_name + "_consistency_log.txt"
print(outfile)
outfile = outfile.strip()
outfile = open(outfile, "w")
outfile2 = dir_path + "/" + base_name + "_consistency_table.txt"
outfile2 = open(outfile2, "w")
junction_list = []
junction_indices_list = []
linecounter = 0
for line in df:
linecounter +=1
line = line.strip()
if line:
try:
junction_list.append(line)
junction_indices_list.append(get_deln_indices(original_seq, line))
except:
print "error reading sequence in data file: line ", linecounter
print "content of incorrect sequence is ", line
outfile.write("error in data file on line ")
outfile.write(str(linecounter))
outfile.write("\n")
outfile.write("content of incorrect sequence is ")
outfile.write(str(line))
outfile.write("\n")
outfile.write("\n")
continue
outfile.write("\n")
outfile.write("\n")
print
print "======================== RESULTS (A): ============================"
outfile.write("======================== RESULTS (A): ============================")
outfile.write("\n")
print "========= SUMMARY OF INPUT DATA AND ANALYSIS PARAMETERS =========="
outfile.write("========= SUMMARY OF INPUT DATA AND ANALYSIS PARAMETERS ==========")
outfile.write("\n")
print
outfile.write("\n")
print "%i junction sequences successfully read from file" %(len(junction_indices_list))
a = "%i junction sequences successfully read from file" %(len(junction_indices_list))
outfile.write(str(a))
print
outfile.write("\n")
z = zip(junction_list, junction_indices_list)
label = 1
table_contents = []
for pair in z:
seq, indices = pair
#print seq, indices
labelstr = str(label)
table_line = labelstr.rjust(3) + ": " + seq
table_contents.append(table_line)
label += 1
for line in table_contents:
print line
outfile.write(str(line))
outfile.write("\n")
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
a = "* # bases to search left/right of junction for SD-MMEJ consistent repeats: %i" %(search_radius)
print a
outfile.write(str(a))
outfile.write("\n")
a = "* Minimum SD-MMEJ consistent repeat size: %i" %(min_consist_rep_size)
print a
outfile.write(str(a))
outfile.write("\n")
a = "----------end summary of input data and analysis parameters--------"
print a
outfile.write(str(a))
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
print
outfile.write("\n")
a = "======================== RESULTS (B): ============================"
print a
outfile.write(str(a))
outfile.write("\n")
a = "=============== ALIGNMENT OF INPUT DATA SEQUENCES ================"
print a
outfile.write(str(a))
outfile.write("\n")
#get EJR objects from junction_indices_list
ejrj_list = []
for indices in junction_indices_list:
ejrj = EJR_junction(original_seq, indices, nick, rightnick)
ejrj_list.append(ejrj)
num_ABJs, num_MHJs = 0, 0
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ": num_ABJs +=1
elif ejrj.jnxn_type == "MHJ": num_MHJs +=1
num_delns = len(ejrj_list)
print
outfile.write("\n")
outfile.write("\n")
a = "* %i total junctions (%i ABJs, %i MHJs)" %(num_delns, num_ABJs, num_MHJs)
print a
outfile.write(str(a))
outfile.write("\n")
a = "* Overhangs in original sequence in lowercase in tables below."
print a
outfile.write(str(a))
outfile.write("\n")
a = "* Ambiguous bases at MHJs have been assigned to the left."
print a
outfile.write(str(a))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
ABJ_list = []
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ":
ABJ_list.append(ejrj)
ori_printseq = left_dna + mid_dna.lower() + right_dna
headerstring1 = "deln".rjust(len(ori_printseq) + 7) + " deln"
headerstring2 = "lowercase in original sequence indicates overhangs".center(len(ori_printseq)) + "bounds".center(9) + "size"
ABJ_tablestrings = []
for ejrj in ABJ_list:
tablestring = ejrj.align_deln_raw_indices() + " " + str(ejrj.raw_deln_indices) + " " + str(ejrj.deln_size).rjust(3)
ABJ_tablestrings.append(tablestring)
a = "--Apparent Blunt Joins--".center(len(headerstring2))
print a
outfile.write(str(a))
outfile.write("\n")
print headerstring1
outfile.write(str(headerstring1))
outfile.write("\n")
print headerstring2
outfile.write(str(headerstring2))
outfile.write("\n")
a = "-"*len(headerstring2)
print a
outfile.write(str(a))
outfile.write("\n")
print ori_printseq
outfile.write(str(ori_printseq))
outfile.write("\n")
a = "-"*len(headerstring2)
print a
outfile.write(str(a))
outfile.write("\n")
for tablestring in ABJ_tablestrings:
print tablestring
outfile.write(str(tablestring))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
MHJ_list = []
for ejrj in ejrj_list:
if ejrj.jnxn_type == "MHJ":
MHJ_list.append(ejrj)
MHJ_tablestrings = []
for ejrj in MHJ_list:
tablestring = ejrj.align_deln_conv_indices() + " " + str(ejrj.converted_deln_indices) + str(ejrj.deln_size).center(6) + str(ejrj.mh_len).center(6) +str(ejrj.mh_rep_in_original_seq)
MHJ_tablestrings.append(tablestring)
headerstring3 = " "*(len(ori_printseq) + 1) + "deln".center(len(str(ejrj.converted_deln_indices))) + "deln".center(6) + "mh".center(6) + "mh indices".center(20)
headerstring4 = "lowercase in original sequence indicates overhangs".center(len(ori_printseq)+1) + "bounds".center(len(str(ejrj.converted_deln_indices))) + "size".center(6) + "len".center(6) + "in original seq".center(20)
a = "--Microhomology Joins--".center(len(headerstring2))
print a
outfile.write(str(a))
outfile.write("\n")
print headerstring3
outfile.write(str(headerstring3))
outfile.write("\n")
print headerstring4
outfile.write(str(headerstring4))
outfile.write("\n")
a = "-"*len(headerstring4)
print a
outfile.write(str(a))
outfile.write("\n")
print ori_printseq
outfile.write(str(ori_printseq))
outfile.write("\n")
a = "-"*len(headerstring4)
print a
outfile.write(str(a))
outfile.write("\n")
for tablestring in MHJ_tablestrings:
print tablestring
outfile.write(str(tablestring))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
a = "----------end alignment of input data sequences--------"
print a
outfile.write(str(a))
print
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
a = "======================== RESULTS (C): ============================"
print a
outfile.write(str(a))
outfile.write("\n")
a = "=============== DELETION BOUNDARY FREQUENCY DATA ================="
print a
outfile.write(str(a))
outfile.write("\n")
# maybe go back and add slice indices to tables; maybe also make tables
# that are broken out by ABJ/MHJ?
print
outfile.write("\n")
outfile.write("\n")
a = "* Deletion boundaries are defined as the first base remaining to"
print a
outfile.write(str(a))
outfile.write("\n")
a = " the left of the right-hand nick and the first base remaining to"
print a
outfile.write(str(a))
outfile.write("\n")
a = " the right of the left-hand nick."
print a
outfile.write(str(a))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
a = "* A consequence of this definition of deletion boundary is that"
print a
outfile.write(str(a))
outfile.write("\n")
a = " values in the _del right_ and _del left_ columns in the tables below"
print a
outfile.write(str(a))
outfile.write("\n")
a = " will NOT necessarily correspond to the net number of bases deleted."
print a
outfile.write(str(a))
outfile.write("\n")
a = " See documentation for more detail as to why."
print a
outfile.write(str(a))
outfile.write("\n")
a = "* Deletion boundaries for microhomology junctions are calculated"
print a
outfile.write(str(a))
outfile.write("\n")
a = " with the ambiguous bases assigned to the left of the junction."
print a
outfile.write(str(a))
outfile.write("\n")
# getting the right-hand deletion boundaries:
#put all the deln indices of the ABJs and the converted deln indices of the MHJs into a list
#the indices you want are the second ones in each tuple.
#make a list of those.
#to convert those indices to bases deleted to the right,
#find the length of the dna to the left of the left nick
#subtract that from the index, and make a new list of the
#converted values.
#make a string of the overhangs plus right dna
#for index, value in enumerate that string,
#list.count(index)
#make a new list of 3-member tuples where you glue together the index, value, and the count
indices_list = []
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ":
indices_list.append(ejrj.raw_deln_indices)
if ejrj.jnxn_type == "MHJ":
indices_list.append(ejrj.converted_deln_indices)
rightbound_index_list = []
for item in indices_list:
rightbound_index_list.append(item[1])
deln_to_right_list = []
for item in rightbound_index_list:
bases_del_right = item - len(left_dna)
deln_to_right_list.append(bases_del_right)
mid_and_right_dna = mid_dna + right_dna
table_line_tuple_list = []
for index, value in enumerate(mid_and_right_dna):
freq = deln_to_right_list.count(index)
table_line = (index, value, freq)
table_line_tuple_list.append(table_line)
table_lines_list = []
for item in table_line_tuple_list:
index, value, freq = item
table_line = str(index).center(5) + str(value).center(5) + str(freq).center(5)
table_lines_list.append(table_line)
headerstringC1 = "Deletion Boundary Frequencies"
headerstringC2 = "first base remaining to RIGHT of LEFT nick:"
headerstringC3 = "-"*15
headerstringC4= "del".center(5) + " "*10
headerstringC5 = "right".center(5) + "base".center(5) + "freq".center(5)
print headerstringC1
outfile.write(str(headerstringC1))
outfile.write("\n")
print headerstringC2
outfile.write(str(headerstringC2))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
print headerstringC4
outfile.write(str(headerstringC4))
outfile.write("\n")
print headerstringC5
outfile.write(str(headerstringC5))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
for line in table_lines_list:
print line
outfile.write(str(line))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
#getting the left-hand deletion boundaries:
##put all the deln indices of the ABJs and the converted deln indices of the MHJs into a list
#the indices you want are the first ones in each tuple.
#make a list of those.
#to convert those indices to bases deleted to the left,
#find the length of left_dna + mid_dna
#subtract the index from that.
#make a new list of the # bp deleted thereby calculated
#reverse the left_dna + mid_dna string
#for index, value in enumerate that string,
#list.count(index)
#make a new list of 3-member tuples where you glue together the index, value, and the count
indices_list = []
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ":
indices_list.append(ejrj.raw_deln_indices)
if ejrj.jnxn_type == "MHJ":
indices_list.append(ejrj.converted_deln_indices)
leftbound_index_list = []
for item in indices_list:
leftbound_index_list.append(item[0])
deln_to_left_list = []
for item in leftbound_index_list:
bases_del_left = len(left_dna + mid_dna) - item
deln_to_left_list.append(bases_del_left)
left_and_center = left_dna + mid_dna
left_and_center = left_and_center[::-1]
table_line_tuple_list = []
for index, value in enumerate(left_and_center):
freq = deln_to_left_list.count(index)
table_line = (index, value, freq)
table_line_tuple_list.append(table_line)
table_line_string_list = []
for item in table_line_tuple_list:
index, value, freq = item
table_line = str(index).center(5) + str(value).center(5) + str(freq).center(5)
table_line_string_list.append(table_line)
headerstringC6 = "first base remaining to LEFT of RIGHT nick:"
headerstringC7 = "left".center(5) + "base".center(5) + "freq".center(5)
print headerstringC1
outfile.write(str(headerstringC1))
outfile.write("\n")
print headerstringC6
outfile.write(str(headerstringC6))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
print headerstringC4
outfile.write(str(headerstringC4))
outfile.write("\n")
print headerstringC7
outfile.write(str(headerstringC7))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
for line in table_line_string_list:
print line
outfile.write(str(line))
outfile.write("\n")
print headerstringC3
outfile.write(str(headerstringC3))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
a = "======================== RESULTS (D): ============================"
print a
outfile.write(str(a))
outfile.write("\n")
a = "============= APPARENT BLUNT JOIN SD-MMEJ ANALYSIS ==============="
print a
outfile.write(str(a))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
# calcs for ABJ summary
total_ABJs = 0
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ": total_ABJs +=1
num_consistent_ABJs = 0
for ejrj in ejrj_list:
if ejrj.jnxn_type == "ABJ":
if len(ejrj.filtered_cr(min_consist_rep_size, search_radius)) != 0: num_consistent_ABJs +=1
print
outfile.write("\n")
outfile.write("\n")
a = "----------- ABJ SUMMARY ----------"
print a
outfile.write(str(a))
outfile.write("\n")
a = "search radius = %i" %(search_radius)
print a
outfile.write(str(a))
outfile.write("\n")
a = "min consistent rep length = %i" %(min_consist_rep_size)
print a
outfile.write(str(a))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
a = "%i total ABJs" %(total_ABJs)
print a
outfile.write(str(a))
outfile.write("\n")
a = "%i SD-MMEJ consistent ABJs" %(num_consistent_ABJs)
print a
outfile.write(str(a))
outfile.write("\n")
a = "----------------------------------"
print a
outfile.write(str(a))
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
print
outfile.write("\n")
outfile.write("\n")
# calcs for summary table of SD-MMEJ consistent ABJs (table D1)