Source code for tool.tb_filter
"""
.. See the NOTICE file distributed with this work for additional information
regarding copyright ownership.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""
from __future__ import print_function
import sys
import os.path
from basic_modules.tool import Tool
from utils import logger
try:
if hasattr(sys, '_run_from_cmdl') is True:
raise ImportError
from pycompss.api.parameter import FILE_IN, FILE_OUT, IN
from pycompss.api.task import task
from pycompss.api.api import compss_wait_on
except ImportError:
logger.info("[Warning] Cannot import \"pycompss\" API packages.")
logger.info(" Using mock decorators.")
from utils.dummy_pycompss import FILE_IN, FILE_OUT, IN # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import task # pylint: disable=ungrouped-imports
from utils.dummy_pycompss import compss_wait_on # pylint: disable=ungrouped-imports
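# The dummy_pycompss stubs act as no-op decorators, so the module can still be
# imported and run serially when PyCOMPSs is not available.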
from pytadbit.parsers.hic_bam_parser import bed2D_to_BAMhic # pylint: disable=import-error,no-name-in-module
from pytadbit.mapping.filter import apply_filter, filter_reads # pylint: disable=import-error
from pytadbit.mapping.analyze import insert_sizes # pylint: disable=import-error
# ------------------------------------------------------------------------------
class tbFilterTool(Tool):  # pylint: disable=invalid-name
"""
    Tool for filtering out experimental artifacts from the aligned data
"""
def __init__(self, configuration=None):
"""
Init function
"""
logger.info("TADbit filter aligned reads")
Tool.__init__(self)
if configuration is None:
configuration = {}
self.configuration.update(configuration)
    @task(
reads=FILE_IN, filter_reads_file=FILE_OUT, custom_filter=IN, conservative=IN,
output_de=FILE_OUT, output_d=FILE_OUT, output_e=FILE_OUT,
output_ed=FILE_OUT, output_or=FILE_OUT, output_rb=FILE_OUT,
output_sc=FILE_OUT, output_tc=FILE_OUT, output_tl=FILE_OUT,
output_ts=FILE_OUT, returns=int)
def tb_filter( # pylint: disable=too-many-arguments,too-many-locals,too-many-statements,too-many-branches,no-self-use
self, reads, filter_reads_file, custom_filter, min_dist_to_re, min_fragment_size,
max_fragment_size, conservative, output_de, output_d,
output_e, output_ed, output_or, output_rb, output_sc, output_tc,
output_tl, output_ts):
"""
Function to filter out expoerimental artifacts
Parameters
----------
reads : str
Location of the reads thats that has a matching location at both
ends of the paired reads
filtered_reads_file : str
Location of the filtered reads
conservative : bool
Level of filtering [DEFAULT : True]
Returns
-------
filtered_reads : str
Location of the filtered reads
"""
reads_tmp = reads.replace(".tsv", '')
with open(reads_tmp + "_tmp.tsv", "wb") as f_out:
with open(reads, "rb") as f_in:
f_out.write(f_in.read())
masked = filter_reads(
reads_tmp + "_tmp.tsv",
max_molecule_length=610,
min_dist_to_re=min_dist_to_re,
over_represented=0.005,
max_frag_size=max_fragment_size,
min_frag_size=min_fragment_size,
re_proximity=4)
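        # `masked` is TADbit's per-filter summary; a sketch of the assumed
        # shape (one entry per filter, keyed by filter ID):
        #   {1: {'name': 'self-circle', 'reads': <count>, ...}, ...}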
filter_reads_file_tmp = filter_reads_file.replace(".tsv", '')
        filters_suffixes = ['self-circle', 'dangling-end', 'error', 'extra_dangling-end',
                            'too_close_from_RES', 'too_short', 'too_large', 'over-represented',
                            'duplicated', 'random_breaks']
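        # The list order mirrors TADbit filter IDs 1 to 10, so a custom filter
        # list such as [1, 2, 9] selects indices [0, 1, 8] below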
if custom_filter:
applied_filters = custom_filter
filters_suffixes = [filters_suffixes[i-1] for i in applied_filters]
else:
if conservative is True:
applied_filters = [1, 2, 3, 4, 6, 7, 8, 9, 10]
# Ignore filter 5 (based on docs) as not very helpful
else:
# Less conservative option
applied_filters = [1, 2, 3, 9, 10]
apply_filter(reads_tmp + "_tmp.tsv", filter_reads_file_tmp + "_tmp.tsv",
masked, filters=applied_filters)
with open(filter_reads_file, "wb") as f_out:
with open(filter_reads_file_tmp + "_tmp.tsv", "rb") as f_in:
f_out.write(f_in.read())
        # Map each filter report suffix to its matching output location
        output_map = {
            'self-circle': output_sc,
            'dangling-end': output_de,
            'error': output_e,
            'extra_dangling-end': output_ed,
            'too_close_from_RES': output_tc,
            'too_short': output_ts,
            'too_large': output_tl,
            'over-represented': output_or,
            'duplicated': output_d,
            'random_breaks': output_rb,
        }
        for i in filters_suffixes:
            report_file_loc = reads_tmp + '_tmp.tsv_' + i + '.tsv'
            logger.info(report_file_loc)
            if os.path.isfile(report_file_loc) is True:
                logger.info("- Present {0}".format(os.path.getsize(report_file_loc)))
                logger.info("- Saving to: " + output_map[i])
                with open(report_file_loc, "rb") as f_in:
                    with open(output_map[i], "wb") as f_out:
                        f_out.write(f_in.read())
return masked
    def run(self, input_files, input_metadata, output_files):  # pylint: disable=too-many-locals,too-many-statements,unused-argument
"""
The main function to filter the reads to remove experimental artifacts
Parameters
----------
        input_files : list
            reads : str
                Location of the reads file that has matching locations at both
                ends of the paired reads
        input_metadata : dict
            conservative : bool
                Level of filtering to apply [DEFAULT : True]
Returns
-------
output_files : list
List of locations for the output files.
        output_metadata : dict
            Matching metadata for the output files
"""
reads = input_files[0]
conservative = True
custom_filter = None
        if 'custom_filter' in input_metadata:
            custom_filter = input_metadata['custom_filter']
elif 'conservative' in input_metadata:
conservative = input_metadata['conservative']
min_dist_to_re = 915
max_fragment_size = 100000
min_fragment_size = 100
if 'min_dist_RE' in input_metadata:
min_dist_to_re = int(input_metadata['min_dist_RE'])
if 'min_fragment_size' in input_metadata:
min_fragment_size = int(input_metadata['min_fragment_size'])
if 'max_fragment_size' in input_metadata:
max_fragment_size = int(input_metadata['max_fragment_size'])
root_name = reads.split("/")
filtered_reads_file = "/".join(root_name[0:-1]) + '/' + \
input_metadata['expt_name'] + '_filtered_map.tsv'
output_de = filtered_reads_file + '_dangling-end.tsv'
output_d = filtered_reads_file + '_duplicated.tsv'
output_e = filtered_reads_file + '_error.tsv'
output_ed = filtered_reads_file + '_extra_dangling-end.tsv'
output_or = filtered_reads_file + '_over-represented.tsv'
output_rb = filtered_reads_file + '_random_breaks.tsv'
output_sc = filtered_reads_file + '_self-circle.tsv'
output_tc = filtered_reads_file + '_too_close_from_RES.tsv'
output_tl = filtered_reads_file + '_too_large.tsv'
output_ts = filtered_reads_file + '_too_short.tsv'
# input and output share most metadata
output_metadata = {}
# handle error
results = self.tb_filter(
reads, filtered_reads_file, custom_filter, min_dist_to_re,
min_fragment_size, max_fragment_size, conservative,
output_de, output_d, output_e, output_ed, output_or, output_rb,
output_sc, output_tc, output_tl, output_ts)
results = compss_wait_on(results)
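        # Optionally convert the filtered reads into TADbit's BAM-based Hi-C
        # representation at the requested location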
if 'outbam' in input_metadata:
outbam = input_metadata['root_dir'] + '/' + input_metadata['outbam']
bed2D_to_BAMhic(filtered_reads_file, True, 32, outbam, 'mid', results)
filtered_reads_file = outbam
hist_path = ''
if 'histogram' in input_metadata:
hist_path = "/".join(root_name[0:-1]) + '/histogram_fragment_sizes_.png'
log_path = "/".join(root_name[0:-1]) + '/filter_log.txt'
median, max_f, mad = insert_sizes(
reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
savefig=hist_path)
orig_stdout = sys.stdout
f_handler = open(log_path, "w")
sys.stdout = f_handler
# insert size
        logger.info('Insert size\n')
        logger.info(' - median insert size = {0}'.format(median))
        logger.info(' - median absolute deviation of insert size = {0}'.format(mad))
        logger.info(' - max insert size (when a gap in continuity of > 10 bp '
                    'is found in fragment lengths) = {0}'.format(max_f))
max_mole = max_f # pseudo DEs
min_dist = max_f + mad # random breaks
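        # max_mole and min_dist are suggested values for the max_molecule_length
        # and min_dist_to_re arguments of filter_reads (see the call above)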
        logger.info(' Using the maximum continuous fragment size '
                    '(' + str(max_mole) + ' bp) to check '
                    'for pseudo-dangling ends')
logger.info(' Using maximum continuous fragment size plus the MAD '
'('+str(min_dist)+' bp) to check for random breaks')
sys.stdout = orig_stdout
f_handler.close()
return_files = [filtered_reads_file]
if 'histogram' in input_metadata:
return_files += [log_path, hist_path]
return (return_files, output_metadata)
# ------------------------------------------------------------------------------
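# A minimal usage sketch (not part of the original module); the input path and
# 'expt_name' value below are hypothetical:
#
#   if __name__ == "__main__":
#       FILTER_TOOL = tbFilterTool()
#       OUT_FILES, OUT_METADATA = FILTER_TOOL.run(
#           ["/data/expt_paired_reads.tsv"],
#           {"expt_name": "expt", "conservative": True},
#           [])
#       print(OUT_FILES)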