From ccc1e06a889d2bf7097a38d30240cb0f18bcbddc Mon Sep 17 00:00:00 2001 From: Daniel Malzl <52696584+dmalzl@users.noreply.github.com> Date: Fri, 10 Apr 2020 12:43:00 +0200 Subject: [PATCH 01/12] changed info dict generation numpy.string_ conversion is not readable by cooler library and thus the generated file cannot be imported by HiGlass for visualization. Using the native Python str function fixes this problem --- hicmatrix/lib/cool.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index fd6182b..9e83ea1 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -372,12 +372,12 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): if 'bin-type' in info: del info['bin-type'] - info['format'] = np.string_('HDF5::Cooler') - info['format-url'] = np.string_('https://github.com/mirnylab/cooler') - info['generated-by'] = np.string_('HiCMatrix-' + __version__) - info['generated-by-cooler-lib'] = np.string_('cooler-' + cooler.__version__) + info['format'] = str('HDF5::Cooler') + info['format-url'] = str('https://github.com/mirnylab/cooler') + info['generated-by'] = str('HiCMatrix-' + __version__) + info['generated-by-cooler-lib'] = str('cooler-' + cooler.__version__) - info['tool-url'] = np.string_('https://github.com/deeptools/HiCMatrix') + info['tool-url'] = str('https://github.com/deeptools/HiCMatrix') # info['nchroms'] = int(bins_data_frame['chrom'][:].nunique()) # info['chromosomes'] = list(bins_data_frame['chrom'][:].unique()) @@ -387,13 +387,13 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): # info['sum-elements'] = int(matrix_data_frame['count'].sum()) if self.hic_metadata is not None and 'matrix-generated-by' in self.hic_metadata: - info['matrix-generated-by'] = np.string_(self.hic_metadata['matrix-generated-by']) + info['matrix-generated-by'] = str(self.hic_metadata['matrix-generated-by']) del self.hic_metadata['matrix-generated-by'] if self.hic_metadata is not None and 'matrix-generated-by-url' in self.hic_metadata: - info['matrix-generated-by-url'] = np.string_(self.hic_metadata['matrix-generated-by-url']) + info['matrix-generated-by-url'] = str(self.hic_metadata['matrix-generated-by-url']) del self.hic_metadata['matrix-generated-by-url'] if self.hic_metadata is not None and 'genome-assembly' in self.hic_metadata: - info['genome-assembly'] = np.string_(self.hic_metadata['genome-assembly']) + info['genome-assembly'] = str(self.hic_metadata['genome-assembly']) del self.hic_metadata['genome-assembly'] local_temp_dir = os.path.dirname(os.path.realpath(pFileName)) From e4b0793b2d2d1145dc14075537af7405b8981d8f Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 23 Apr 2020 12:59:04 +0200 Subject: [PATCH 02/12] Fixing correction bug in cool for file created by hic2cool --- hicmatrix/__init__.py | 4 +- hicmatrix/_version.py | 2 +- hicmatrix/lib/cool.py | 117 ++++++++++++++++++++++-------------------- 3 files changed, 66 insertions(+), 57 deletions(-) diff --git a/hicmatrix/__init__.py b/hicmatrix/__init__.py index 7d33780..843f7af 100644 --- a/hicmatrix/__init__.py +++ b/hicmatrix/__init__.py @@ -1,3 +1,5 @@ import logging -logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.INFO) +logging.basicConfig(level=logging.DEBUG) + logging.getLogger('cooler').setLevel(logging.WARNING) diff --git a/hicmatrix/_version.py b/hicmatrix/_version.py index e2fe542..343d325 100644 --- a/hicmatrix/_version.py +++ b/hicmatrix/_version.py @@ -1,2 +1,2 @@ -__version__ = '12' +__version__ = '13-dev' # Version number differs from HiCExplorer! diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index fd6182b..323b4e3 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -37,7 +37,7 @@ def __init__(self, pMatrixFile=None): self.hic2cool_version = None self.hicmatrix_version = None - self.scaleToOriginalRange = None + # self.scaleToOriginalRange = None # self.correction_factors = None def getInformationCoolerBinNames(self): @@ -65,7 +65,6 @@ def load(self): e log.debug('self.chrnameList {}'.format(self.chrnameList)) if self.chrnameList is None: - log.debug('muh 69') matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) used_dtype = np.int32 @@ -96,8 +95,8 @@ def load(self): del _features matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype) - self.minValue = data.min() - self.maxValue = data.max() + # self.minValue = data.min() + # self.maxValue = data.max() del data del instances @@ -108,12 +107,12 @@ def load(self): log.debug('Load data') matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr() # handle the case of an empty csr matrix - if len(matrix.data) == 0: - self.minValue = 0 - self.maxValue = 0 - else: - self.minValue = matrix.data.min() - self.maxValue = matrix.data.max() + # if len(matrix.data) == 0: + # self.minValue = 0 + # self.maxValue = 0 + # else: + # self.minValue = matrix.data.min() + # self.maxValue = matrix.data.max() except ValueError as ve: log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.") ve @@ -126,7 +125,7 @@ def load(self): if self.chrnameList is not None: if len(self.chrnameList) == 1: cut_intervals_data_frame = cooler_file.bins().fetch(self.chrnameList[0]) - + log.debug('cut_intervals_data_frame {}'.format(list(cut_intervals_data_frame.columns))) if self.correctionFactorTable in cut_intervals_data_frame: correction_factors_data_frame = cut_intervals_data_frame[self.correctionFactorTable] else: @@ -156,34 +155,39 @@ def load(self): features_factors = correction_factors[features] if self.correctionOperator is None: + if self.correctionFactorTable in ['KR', 'VC', 'SQRT_VC']: + self.correctionOperator = '/' + else: + self.correctionOperator = '*' + # log.debug('cooler_file.info: {}'.format(cooler_file.info)) if 'generated-by' in cooler_file.info: log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by']))) generated_by = toString(cooler_file.info['generated-by']) if 'hic2cool' in generated_by: self.hic2cool_version = generated_by.split('-')[1] - if self.hic2cool_version >= '0.5': - log.debug('0.5') - self.correctionOperator = '/' - else: - log.debug('0.4') - - self.correctionOperator = '*' - else: - self.correctionOperator = '*' - - log.debug('hic2cool: {}'.format(self.hic2cool_version)) - log.debug('self.correctionOperator : {}'.format(self.correctionOperator)) - - # elif 'hicmatrix' in generated_by: - - # self.hicmatrix_version = generated_by.split('-')[1] - # if self.hicmatrix_version >= '8': - # self.correctionOperator = '/' - # else: - # self.correctionOperator = '*' - else: - self.correctionOperator = '*' + # if self.hic2cool_version >= '0.5': + # log.debug('0.5') + # self.correctionOperator = '/' + # else: + # log.debug('0.4') + + # self.correctionOperator = '*' + # else: + # self.correctionOperator = '*' + + # log.debug('hic2cool: {}'.format(self.hic2cool_version)) + # log.debug('self.correctionOperator : {}'.format(self.correctionOperator)) + + elif 'hicmatrix' in generated_by: + + self.hicmatrix_version = generated_by.split('-')[1] + # # if self.hicmatrix_version >= '8': + # # self.correctionOperator = '/' + # # else: + # # self.correctionOperator = '*' + # else: + # self.correctionOperator = '*' instances_factors *= features_factors log.debug('hic2cool: {}'.format(self.hic2cool_version)) @@ -194,20 +198,20 @@ def load(self): matrix.data /= instances_factors # if self.scaleToOriginalRange is not None: - min_value = matrix.data.min() - max_value = matrix.data.max() + # min_value = matrix.data.min() + # max_value = matrix.data.max() # check if max smaller one or if not same mangnitude - if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): - desired_range_difference = self.maxValue - self.minValue + # if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): + # desired_range_difference = self.maxValue - self.minValue - min_value = matrix.data.min() - max_value = matrix.data.max() + # min_value = matrix.data.min() + # max_value = matrix.data.max() - matrix.data = (matrix.data - min_value) - matrix.data /= (max_value - min_value) - matrix.data *= desired_range_difference - matrix.data += self.minValue - self.scaleToOriginalRange = True + # matrix.data = (matrix.data - min_value) + # matrix.data /= (max_value - min_value) + # matrix.data *= desired_range_difference + # matrix.data += self.minValue + # self.scaleToOriginalRange = True # diff_scale_factor = matrix.data.max() / max_value # if self.correctionOperator == '*': # correction_factors *= diff_scale_factor @@ -276,7 +280,10 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): dtype_pixel = {'bin1_id': np.int32, 'bin2_id': np.int32, 'count': np.int32} if self.correction_factors is not None and pApplyCorrection: dtype_pixel['weight'] = np.float32 - if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5: + + # if the correction was applied by a division, invert it because cool format expects multiplicative if table name is 'weight' + # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix + if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/': log.debug('wash5 true') self.correction_factors = np.array(self.correction_factors).flatten() @@ -306,20 +313,20 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): log.debug('self.correctionOperator: {}'.format(self.correctionOperator)) log.debug('self.fileWasH5: {}'.format(self.fileWasH5)) - if self.scaleToOriginalRange: - min_value = self.matrix.data.min() - max_value = self.matrix.data.max() - desired_range_difference = max_value - min_value + # if self.scaleToOriginalRange: + # min_value = self.matrix.data.min() + # max_value = self.matrix.data.max() + # desired_range_difference = max_value - min_value - self.matrix.data = (self.matrix.data - self.minValue) - self.matrix.data /= (self.maxValue - self.minValue) - self.matrix.data *= desired_range_difference - self.matrix.data += min_value + # self.matrix.data = (self.matrix.data - self.minValue) + # self.matrix.data /= (self.maxValue - self.minValue) + # self.matrix.data *= desired_range_difference + # self.matrix.data += min_value if self.correctionOperator == '*' or self.correctionOperator is None: self.matrix.data /= instances_factors - elif self.correctionOperator == '/' or self.fileWasH5: - self.matrix.data *= instances_factors + # elif self.correctionOperator == '/' or self.fileWasH5: + # self.matrix.data *= instances_factors instances_factors = None features_factors = None From b3d1d72bd8b79761e326e1d848912a64f96396d5 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 30 Apr 2020 11:11:56 +0000 Subject: [PATCH 03/12] adding support for partial load for one chromosome by distance --- hicmatrix/HiCMatrix.py | 4 +-- hicmatrix/lib/cool.py | 40 ++++++++++++++++++++++-------- hicmatrix/lib/matrixFileHandler.py | 6 ++++- 3 files changed, 37 insertions(+), 13 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 39acf73..8195f32 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -27,7 +27,7 @@ class hiCMatrix: get sub matrices by chrname. """ - def __init__(self, pMatrixFile=None, pChrnameList=None): + def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None): self.non_homogeneous_warning_already_printed = False self.bin_size = None self.bin_size_homogeneous = None # track if the bins are equally spaced or not @@ -49,7 +49,7 @@ def __init__(self, pMatrixFile=None, pChrnameList=None): fileType = 'cool' if pMatrixFile.endswith('.h5'): fileType = 'h5' - self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList) + self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList, pDistance=pDistance) log.debug('init time: {}'.format(time.time() - start_time)) self.matrix, self.cut_intervals, self.nan_bins, \ self.correction_factors, self.distance_counts = self.matrixFileHandler.load() diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 323b4e3..e12c720 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -10,7 +10,7 @@ import cooler import h5py import numpy as np -from scipy.sparse import triu, csr_matrix +from scipy.sparse import triu, csr_matrix, lil_matrix import pandas as pd from hicmatrix.utilities import toString, toBytes @@ -37,6 +37,7 @@ def __init__(self, pMatrixFile=None): self.hic2cool_version = None self.hicmatrix_version = None + self.distance = None # self.scaleToOriginalRange = None # self.correction_factors = None @@ -66,6 +67,7 @@ def load(self): log.debug('self.chrnameList {}'.format(self.chrnameList)) if self.chrnameList is None: + matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) used_dtype = np.int32 if np.iinfo(np.int32).max < cooler_file.info['nbins']: @@ -105,17 +107,35 @@ def load(self): if len(self.chrnameList) == 1: try: log.debug('Load data') - matrix = cooler_file.matrix(balance=False, sparse=True).fetch(self.chrnameList[0]).tocsr() - # handle the case of an empty csr matrix - # if len(matrix.data) == 0: - # self.minValue = 0 - # self.maxValue = 0 - # else: - # self.minValue = matrix.data.min() - # self.maxValue = matrix.data.max() + log.debug('self.distance {}'.format(self.distance)) + if self.distance is None: + matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr() + + else: + lo, hi = cooler_file.extent(self.chrnameList[0]) + dist = 2000000 // cooler_file.binsize + step = (hi - lo) // 16 + if step < 1: + step = 1 + print('hi {}; lo {}'.format(hi,lo)) + mat = lil_matrix((hi - lo, hi - lo)) + row = [] + col = [] + data = [] + for i0, i1 in cooler.util.partition(lo, hi, step): + # fetch stripe + pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] + # filter + pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist] + # insert into sparse matrix + + mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'] + mat[pixels['bin2_id'] - lo, pixels['bin1_id'] - lo] = pixels['count'] + matrix = mat.tocsr() + del mat except ValueError as ve: log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.") - ve + log.exception('Error: {}'.format(str(ve))) else: raise Exception("Operation to load more as one region is not supported.") diff --git a/hicmatrix/lib/matrixFileHandler.py b/hicmatrix/lib/matrixFileHandler.py index 233d51e..dd8a5fc 100644 --- a/hicmatrix/lib/matrixFileHandler.py +++ b/hicmatrix/lib/matrixFileHandler.py @@ -10,7 +10,8 @@ class MatrixFileHandler(): def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None, pApplyCorrectionCoolerLoad=None, pBedFileHicPro=None, pCorrectionFactorTable=None, - pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None): + pCorrectionOperator=None, pEnforceInteger=None, pAppend=None, pFileWasH5=None, pHiCInfo=None, pHic2CoolVersion=None, + pDistance=None): self.class_ = getattr(importlib.import_module('.' + pFileType.lower(), package='hicmatrix.lib'), pFileType.title()) @@ -39,6 +40,9 @@ def __init__(self, pFileType='cool', pMatrixFile=None, pChrnameList=None, log.debug('pHic2CoolVersion : {}'.format(pHic2CoolVersion)) if pHic2CoolVersion is not None: self.matrixFile.hic2cool_version = pHic2CoolVersion + if pDistance is not None: + self.matrixFile.distance = pDistance + log.debug('self.distance {}'.format(self.matrixFile.distance)) def load(self): From 31358a1b5d103bddc8e939cff896371a58073cf0 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Mon, 4 May 2020 19:38:36 +0000 Subject: [PATCH 04/12] Adding parameters to not load unused variables --- hicmatrix/HiCMatrix.py | 13 +++++---- hicmatrix/lib/cool.py | 64 +++++++++--------------------------------- 2 files changed, 21 insertions(+), 56 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 8195f32..535223d 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -27,7 +27,7 @@ class hiCMatrix: get sub matrices by chrname. """ - def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None): + def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoIntervalTree=None, pUpperTriangleOnly=None): self.non_homogeneous_warning_already_printed = False self.bin_size = None self.bin_size_homogeneous = None # track if the bins are equally spaced or not @@ -65,7 +65,8 @@ def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None): if self.nan_bins is None: self.nan_bins = np.array([]) - self.fillLowerTriangle() + if pUpperTriangleOnly is None or not pUpperTriangleOnly: + self.fillLowerTriangle() log.debug('triangle time: {}'.format(time.time() - start_time)) start_time = time.time() @@ -76,9 +77,11 @@ def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None): start_time = time.time() log.debug('restoreMaskedBins') - - self.interval_trees, self.chrBinBoundaries = \ - self.intervalListToIntervalTree(self.cut_intervals) + if pNoIntervalTree is None or not pNoIntervalTree: + self.interval_trees, self.chrBinBoundaries = \ + self.intervalListToIntervalTree(self.cut_intervals) + else: + log.debug('no intervaltree') log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time)) start_time = time.time() diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index e12c720..a7bd604 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -6,6 +6,7 @@ import math import time +import gc import cooler import h5py @@ -95,7 +96,7 @@ def load(self): del _data del _instances del _features - + gc.collect() matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype) # self.minValue = data.min() # self.maxValue = data.max() @@ -103,6 +104,7 @@ def load(self): del data del instances del features + gc.collect() else: if len(self.chrnameList) == 1: try: @@ -113,15 +115,11 @@ def load(self): else: lo, hi = cooler_file.extent(self.chrnameList[0]) - dist = 2000000 // cooler_file.binsize - step = (hi - lo) // 16 + dist = self.distance // cooler_file.binsize + step = (hi - lo) // 32 if step < 1: step = 1 - print('hi {}; lo {}'.format(hi,lo)) - mat = lil_matrix((hi - lo, hi - lo)) - row = [] - col = [] - data = [] + mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32) for i0, i1 in cooler.util.partition(lo, hi, step): # fetch stripe pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] @@ -129,10 +127,15 @@ def load(self): pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist] # insert into sparse matrix - mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'] - mat[pixels['bin2_id'] - lo, pixels['bin1_id'] - lo] = pixels['count'] + mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32) + # mat[pixels['bin2_id'] - lo, pixels['bin1_id'] - lo] = pixels['count'] + del pixels + gc.collect() + matrix = mat.tocsr() del mat + gc.collect() + except ValueError as ve: log.exception("Wrong chromosome format. Please check UCSC / ensembl notation.") log.exception('Error: {}'.format(str(ve))) @@ -179,35 +182,15 @@ def load(self): self.correctionOperator = '/' else: self.correctionOperator = '*' - # log.debug('cooler_file.info: {}'.format(cooler_file.info)) if 'generated-by' in cooler_file.info: log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by']))) generated_by = toString(cooler_file.info['generated-by']) if 'hic2cool' in generated_by: self.hic2cool_version = generated_by.split('-')[1] - # if self.hic2cool_version >= '0.5': - # log.debug('0.5') - # self.correctionOperator = '/' - # else: - # log.debug('0.4') - - # self.correctionOperator = '*' - # else: - # self.correctionOperator = '*' - - # log.debug('hic2cool: {}'.format(self.hic2cool_version)) - # log.debug('self.correctionOperator : {}'.format(self.correctionOperator)) - elif 'hicmatrix' in generated_by: self.hicmatrix_version = generated_by.split('-')[1] - # # if self.hicmatrix_version >= '8': - # # self.correctionOperator = '/' - # # else: - # # self.correctionOperator = '*' - # else: - # self.correctionOperator = '*' instances_factors *= features_factors log.debug('hic2cool: {}'.format(self.hic2cool_version)) @@ -217,27 +200,6 @@ def load(self): elif self.correctionOperator == '/': matrix.data /= instances_factors - # if self.scaleToOriginalRange is not None: - # min_value = matrix.data.min() - # max_value = matrix.data.max() - # check if max smaller one or if not same mangnitude - # if max_value < 1 or (np.absolute(int(math.log10(max_value)) - int(math.log10(self.maxValue))) > 1): - # desired_range_difference = self.maxValue - self.minValue - - # min_value = matrix.data.min() - # max_value = matrix.data.max() - - # matrix.data = (matrix.data - min_value) - # matrix.data /= (max_value - min_value) - # matrix.data *= desired_range_difference - # matrix.data += self.minValue - # self.scaleToOriginalRange = True - # diff_scale_factor = matrix.data.max() / max_value - # if self.correctionOperator == '*': - # correction_factors *= diff_scale_factor - # if self.correctionOperator == '/': - # correction_factors /= diff_scale_factor - cut_intervals = [] time_start = time.time() log.debug('Creating cut_intervals {}'.format(time_start)) From c019729f3252ba1e4bcd081720b5c815dd4acd9f Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Fri, 8 May 2020 10:52:29 +0000 Subject: [PATCH 05/12] Adding a test case and cleaning up a bit --- hicmatrix/HiCMatrix.py | 26 +++------- hicmatrix/lib/cool.py | 65 ++++++------------------ hicmatrix/test/test_matrixFileHandler.py | 48 ++++++++++++++++- 3 files changed, 68 insertions(+), 71 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 535223d..2b5312f 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -51,41 +51,29 @@ def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoInter fileType = 'h5' self.matrixFileHandler = MatrixFileHandler(pFileType=fileType, pMatrixFile=pMatrixFile, pChrnameList=pChrnameList, pDistance=pDistance) log.debug('init time: {}'.format(time.time() - start_time)) - self.matrix, self.cut_intervals, self.nan_bins, \ - self.correction_factors, self.distance_counts = self.matrixFileHandler.load() - # if len(self.matrix.data) == 0: - # log.warning('No data for {}, not initialization of object. '.format(pChrnameList)) - # self.interval_trees = None - # self.chrBinBoundaries = None - # return - log.debug('load time: {}'.format(time.time() - start_time)) - start_time = time.time() - - log.debug('data loaded from file handler') + matrixFileHandler_load = self.matrixFileHandler.load() + # check if there was any exception thrown in the load function + if len(matrixFileHandler_load) == 2: + raise Exception('Matrix failed to load: {}'.format(str(matrixFileHandler_load[1]))) + else: + self.matrix, self.cut_intervals, self.nan_bins, \ + self.correction_factors, self.distance_counts = matrixFileHandler_load if self.nan_bins is None: self.nan_bins = np.array([]) if pUpperTriangleOnly is None or not pUpperTriangleOnly: self.fillLowerTriangle() - log.debug('triangle time: {}'.format(time.time() - start_time)) start_time = time.time() - log.debug('fillLowerTriangle') self.restoreMaskedBins() - log.debug('restoreMaskedBins: {}'.format(time.time() - start_time)) start_time = time.time() - log.debug('restoreMaskedBins') if pNoIntervalTree is None or not pNoIntervalTree: self.interval_trees, self.chrBinBoundaries = \ self.intervalListToIntervalTree(self.cut_intervals) else: log.debug('no intervaltree') - log.debug('intervalListToIntervalTree: {}'.format(time.time() - start_time)) - start_time = time.time() - - log.debug('intervalListToIntervalTree') elif pMatrixFile is None: log.debug('Only init object, no matrix given.') diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index a7bd604..ce9d6d1 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -32,26 +32,23 @@ def __init__(self, pMatrixFile=None): self.appendData = False self.fileWasH5 = False self.applyCorrectionLoad = True - # self.hic_info = {} self.hic_metadata = {} self.cool_info = None self.hic2cool_version = None self.hicmatrix_version = None self.distance = None - # self.scaleToOriginalRange = None - # self.correction_factors = None def getInformationCoolerBinNames(self): return cooler.Cooler(self.matrixFileName).bins().columns.values + def load(self): log.debug('Load in cool format') self.minValue = None self.maxValue = None if self.matrixFileName is None: - log.info('No matrix is initialized') - + log.warning('No matrix is initialized') try: cooler_file = cooler.Cooler(self.matrixFileName) if 'metadata' in cooler_file.info: @@ -59,16 +56,12 @@ def load(self): else: self.hic_metadata = None self.cool_info = deepcopy(cooler_file.info) - # log.debug("cooler_file.info {}".format(cooler_file.info)) except Exception as e: - log.info("Could not open cooler file. Maybe the path is wrong or the given node is not available.") - log.info('The following file was tried to open: {}'.format(self.matrixFileName)) - log.info("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))) - e - log.debug('self.chrnameList {}'.format(self.chrnameList)) + log.warning("Could not open cooler file. Maybe the path is wrong or the given node is not available.") + log.warning('The following file was tried to open: {}'.format(self.matrixFileName)) + log.warning("The following nodes are available: {}".format(cooler.fileops.list_coolers(self.matrixFileName.split("::")[0]))) + return None, e if self.chrnameList is None: - - matrixDataFrame = cooler_file.matrix(balance=False, sparse=True, as_pixels=True) used_dtype = np.int32 if np.iinfo(np.int32).max < cooler_file.info['nbins']: @@ -96,41 +89,36 @@ def load(self): del _data del _instances del _features - gc.collect() matrix = csr_matrix((data, (instances, features)), shape=(np.int(cooler_file.info['nbins']), np.int(cooler_file.info['nbins'])), dtype=count_dtype) - # self.minValue = data.min() - # self.maxValue = data.max() del data del instances del features gc.collect() + else: if len(self.chrnameList) == 1: try: - log.debug('Load data') - log.debug('self.distance {}'.format(self.distance)) - if self.distance is None: + if self.distance is None or cooler_file.binsize is None: + # load the full chromosome matrix = cooler_file.matrix(balance=False, sparse=True, as_pixels=False).fetch(self.chrnameList[0]).tocsr() - else: + # load only the values up to a specific distance lo, hi = cooler_file.extent(self.chrnameList[0]) dist = self.distance // cooler_file.binsize step = (hi - lo) // 32 if step < 1: step = 1 mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32) + for i0, i1 in cooler.util.partition(lo, hi, step): # fetch stripe pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] # filter pixels = pixels[(pixels['bin2_id'] - pixels['bin1_id']) < dist] # insert into sparse matrix - mat[pixels['bin1_id'] - lo, pixels['bin2_id'] - lo] = pixels['count'].astype(np.float32) - # mat[pixels['bin2_id'] - lo, pixels['bin1_id'] - lo] = pixels['count'] del pixels - gc.collect() matrix = mat.tocsr() del mat @@ -186,10 +174,8 @@ def load(self): log.debug('cooler_file.info[\'generated-by\'] {} {}'.format(cooler_file.info['generated-by'], type(cooler_file.info['generated-by']))) generated_by = toString(cooler_file.info['generated-by']) if 'hic2cool' in generated_by: - self.hic2cool_version = generated_by.split('-')[1] elif 'hicmatrix' in generated_by: - self.hicmatrix_version = generated_by.split('-')[1] instances_factors *= features_factors @@ -202,10 +188,8 @@ def load(self): cut_intervals = [] time_start = time.time() - log.debug('Creating cut_intervals {}'.format(time_start)) for values in cut_intervals_data_frame.values: cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0])) - log.debug('Creating cut_intervals {} DONE'.format(time.time() - time_start)) del cut_intervals_data_frame del correction_factors_data_frame # try to restore nan_bins. @@ -213,7 +197,6 @@ def load(self): shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1] nan_bins = np.arange(shape) nan_bins = np.setdiff1d(nan_bins, matrix.indices) - except Exception: nan_bins = None @@ -267,7 +250,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): # https://cooler.readthedocs.io/en/latest/api.html#cooler.Cooler.matrix if (self.hic2cool_version is not None and self.hic2cool_version >= '0.5') or self.fileWasH5 or self.correctionOperator == '/': - log.debug('wash5 true') + log.debug('h5 true') self.correction_factors = np.array(self.correction_factors).flatten() self.correction_factors = 1 / self.correction_factors mask = np.isnan(self.correction_factors) @@ -279,7 +262,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): weight = convertNansToOnes(np.array(self.correction_factors).flatten()) bins_data_frame = bins_data_frame.assign(weight=weight) - log.info("Reverting correction factors on matrix...") + log.debug("Reverting correction factors on matrix...") instances, features = self.matrix.nonzero() self.correction_factors = np.array(self.correction_factors) @@ -292,32 +275,14 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): self.matrix.data = self.matrix.data.astype(float) # Apply the invert operation to get the original data - log.debug('self.correctionOperator: {}'.format(self.correctionOperator)) - log.debug('self.fileWasH5: {}'.format(self.fileWasH5)) - - # if self.scaleToOriginalRange: - # min_value = self.matrix.data.min() - # max_value = self.matrix.data.max() - # desired_range_difference = max_value - min_value - - # self.matrix.data = (self.matrix.data - self.minValue) - # self.matrix.data /= (self.maxValue - self.minValue) - # self.matrix.data *= desired_range_difference - # self.matrix.data += min_value - if self.correctionOperator == '*' or self.correctionOperator is None: self.matrix.data /= instances_factors - # elif self.correctionOperator == '/' or self.fileWasH5: - # self.matrix.data *= instances_factors instances_factors = None features_factors = None self.matrix.eliminate_zeros() - log.debug('self.correction_factors {}'.format(self.correction_factors)) - log.debug('pApplyCorrection {}'.format(pApplyCorrection)) - if self.correction_factors is not None and pApplyCorrection is False: dtype_pixel['weight'] = np.float32 weight = convertNansToOnes(np.array(self.correction_factors).flatten()) @@ -338,7 +303,7 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): matrix_data_frame = matrix_data_frame.assign(count=self.matrix.data) if not self.enforceInteger and self.matrix.dtype not in [np.int32, int]: - log.warning("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype)) + log.debug("Writing non-standard cooler matrix. Datatype of matrix['count'] is: {}".format(self.matrix.dtype)) dtype_pixel['count'] = self.matrix.dtype split_factor = 1 if len(self.matrix.data) > 1e7: @@ -399,4 +364,4 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): fileName = pFileName.split('::')[0] with h5py.File(fileName, 'r+') as h5file: h5file.attrs.update(info) - h5file.close() + h5file.close() \ No newline at end of file diff --git a/hicmatrix/test/test_matrixFileHandler.py b/hicmatrix/test/test_matrixFileHandler.py index 7f184c1..8d0e9bf 100644 --- a/hicmatrix/test/test_matrixFileHandler.py +++ b/hicmatrix/test/test_matrixFileHandler.py @@ -252,6 +252,50 @@ def test_save_cool(): os.unlink(cool_outfile) +def test_load_distance_cool(): + cool_outfile = outfile + '.cool' + + # create matrixFileHandler instance with filetype 'cool' + pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool' + fh = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pChrnameList=['1'], pDistance=2500000) + assert fh is not None + + # load data + matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() + # set matrix variables + fh.set_matrix_variables(matrix, cut_intervals, nan_bins, correction_factors, distance_counts) + # and save it. + fh.save(pName=cool_outfile, pSymmetric=True, pApplyCorrection=True) + + fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile) + assert fh_test is not None + matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() + + # check distance load works as expected + instances, features = matrix.nonzero() + distances = np.absolute(instances - features) + # log.debug('max: {}'.format(np.max(distances))) + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + assert np.sum(mask) == 0 + + fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile) + assert fh is not None + + # load data + matrix2, _, _, _, _ = fh.load() + instances, features = matrix2.nonzero() + distances = np.absolute(instances - features) + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + assert np.sum(mask) > 0 + + # check if load and save matrix are equal + nt.assert_equal(matrix.data, matrix_test.data) + nt.assert_equal(cut_intervals, cut_intervals_test) + nt.assert_equal(nan_bins, nan_bins_test) + nt.assert_equal(distance_counts, distance_counts_test) + nt.assert_equal(correction_factors, correction_factors_test) + + os.unlink(cool_outfile) def test_load_h5_save_cool(): cool_outfile = outfile + '.cool' @@ -346,7 +390,7 @@ def test_save_cool_enforce_integer(): def test_load_cool_hic2cool_versions(): pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool042.cool' - hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR') + hic2cool_042 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR', pCorrectionOperator='*') pMatrixFile = ROOT + 'GSE63525_GM12878_insitu_primary_2_5mb_hic2cool051.cool' hic2cool_051 = MatrixFileHandler(pFileType='cool', pMatrixFile=pMatrixFile, pCorrectionFactorTable='KR') @@ -383,7 +427,7 @@ def test_save_cool_apply_division(): fh_new.save(pName=cool_outfile, pSymmetric=False, pApplyCorrection=True) - fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile, pCorrectionOperator='/') + fh_test = MatrixFileHandler(pFileType='cool', pMatrixFile=cool_outfile) assert fh_test is not None matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() pMatrixFile = ROOT + 'Li_et_al_2015.cool' From 5717986a35aa651c1b285f7873bb9c329f98c5d3 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 14 May 2020 17:33:06 +0200 Subject: [PATCH 06/12] No future anymore and testing python 3.7. this one works, however python 3.8 not. Linting; set version number to 13, log level info --- hicmatrix/HiCMatrix.py | 3 +-- hicmatrix/__init__.py | 4 ++-- hicmatrix/_version.py | 2 +- hicmatrix/lib/cool.py | 6 ++---- hicmatrix/test/test_HiCMatrix.py | 18 +++++++++--------- hicmatrix/test/test_matrixFileHandler.py | 20 +++++++++++--------- requirements.txt | 3 +-- setup.py | 8 ++++---- 8 files changed, 31 insertions(+), 33 deletions(-) diff --git a/hicmatrix/HiCMatrix.py b/hicmatrix/HiCMatrix.py index 2b5312f..1edb9ac 100644 --- a/hicmatrix/HiCMatrix.py +++ b/hicmatrix/HiCMatrix.py @@ -65,7 +65,6 @@ def __init__(self, pMatrixFile=None, pChrnameList=None, pDistance=None, pNoInter self.fillLowerTriangle() start_time = time.time() - self.restoreMaskedBins() start_time = time.time() @@ -1007,6 +1006,6 @@ def intervalListToIntervalTree(self, interval_list): def check_cooler(pFileName): - if pFileName.endswith('.cool') or cooler.io.is_cooler(pFileName) or'.mcool' in pFileName: + if pFileName.endswith('.cool') or cooler.io.is_cooler(pFileName) or '.mcool' in pFileName: return True return False diff --git a/hicmatrix/__init__.py b/hicmatrix/__init__.py index 843f7af..7ee7855 100644 --- a/hicmatrix/__init__.py +++ b/hicmatrix/__init__.py @@ -1,5 +1,5 @@ import logging -# logging.basicConfig(level=logging.INFO) -logging.basicConfig(level=logging.DEBUG) +logging.basicConfig(level=logging.INFO) +# logging.basicConfig(level=logging.DEBUG) logging.getLogger('cooler').setLevel(logging.WARNING) diff --git a/hicmatrix/_version.py b/hicmatrix/_version.py index 343d325..0626305 100644 --- a/hicmatrix/_version.py +++ b/hicmatrix/_version.py @@ -1,2 +1,2 @@ -__version__ = '13-dev' +__version__ = '13' # Version number differs from HiCExplorer! diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index d3e851b..ca29c4b 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -42,7 +42,6 @@ def __init__(self, pMatrixFile=None): def getInformationCoolerBinNames(self): return cooler.Cooler(self.matrixFileName).bins().columns.values - def load(self): log.debug('Load in cool format') self.minValue = None @@ -110,7 +109,7 @@ def load(self): if step < 1: step = 1 mat = lil_matrix((hi - lo, hi - lo), dtype=np.float32) - + for i0, i1 in cooler.util.partition(lo, hi, step): # fetch stripe pixels = cooler_file.matrix(balance=False, as_pixels=True)[i0:i1, lo:hi] @@ -187,7 +186,6 @@ def load(self): matrix.data /= instances_factors cut_intervals = [] - time_start = time.time() for values in cut_intervals_data_frame.values: cut_intervals.append(tuple([toString(values[0]), values[1], values[2], 1.0])) del cut_intervals_data_frame @@ -364,4 +362,4 @@ def save(self, pFileName, pSymmetric=True, pApplyCorrection=True): fileName = pFileName.split('::')[0] with h5py.File(fileName, 'r+') as h5file: h5file.attrs.update(info) - h5file.close() \ No newline at end of file + h5file.close() diff --git a/hicmatrix/test/test_HiCMatrix.py b/hicmatrix/test/test_HiCMatrix.py index c1583f2..75c9a64 100644 --- a/hicmatrix/test/test_HiCMatrix.py +++ b/hicmatrix/test/test_HiCMatrix.py @@ -5,7 +5,7 @@ from os import unlink import warnings from six import iteritems -from past.builtins import zip +# from past.builtins import zip from collections import OrderedDict from intervaltree import IntervalTree, Interval @@ -35,8 +35,8 @@ def test_load_h5_save_and_load_cool(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -53,8 +53,8 @@ def test_load_h5_save_and_load_cool_2(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -71,8 +71,8 @@ def test_load_cool_save_and_load_h5(): hic_cool = hm.hiCMatrix(outfile.name) nt.assert_equal(hic_cool.matrix.data, hic.matrix.data) - chrom_cool, start_cool, end_cool, _ = zip(*hic_cool.cut_intervals) - chrom, start, end, _ = zip(*hic_cool.cut_intervals) + chrom_cool, start_cool, end_cool, _ = list(zip(*hic_cool.cut_intervals)) + chrom, start, end, _ = list(zip(*hic_cool.cut_intervals)) nt.assert_equal(chrom_cool, chrom) nt.assert_equal(start_cool, start) @@ -201,8 +201,8 @@ def test_convert_to_zscore_matrix_2(): m_size = mat.shape[0] # compute matrix values per distance - chrom, start, end, extra = zip( - *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals)) + chrom, start, end, extra = list(zip( + *hm.hiCMatrix.fit_cut_intervals(hic.cut_intervals))) dist_values = {} sys.stderr.write("Computing values per distance for each matrix entry\n") diff --git a/hicmatrix/test/test_matrixFileHandler.py b/hicmatrix/test/test_matrixFileHandler.py index 8d0e9bf..ece135a 100644 --- a/hicmatrix/test/test_matrixFileHandler.py +++ b/hicmatrix/test/test_matrixFileHandler.py @@ -207,12 +207,12 @@ def test_load_cool2(capsys): nt.assert_almost_equal(matrix[0].todense(), test_matrix) test_cut_intervals = sum([[('chr1', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3909)], - [('chr1', 195450000, 195471971, 1.0)], - [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)], - [('chrX', 171000000, 171031299, 1.0)], - [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)], - [('chrY', 91700000, 91744698, 1.0)], - [('chrM', 0, 16299, 1.0)]], []) + [('chr1', 195450000, 195471971, 1.0)], + [('chrX', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(3420)], + [('chrX', 171000000, 171031299, 1.0)], + [('chrY', i * bin_size, (i + 1) * bin_size, 1.0) for i in range(1834)], + [('chrY', 91700000, 91744698, 1.0)], + [('chrM', 0, 16299, 1.0)]], []) for index, tup in enumerate(cut_intervals): for ind, element in enumerate(tup): @@ -252,6 +252,7 @@ def test_save_cool(): os.unlink(cool_outfile) + def test_load_distance_cool(): cool_outfile = outfile + '.cool' @@ -275,17 +276,17 @@ def test_load_distance_cool(): instances, features = matrix.nonzero() distances = np.absolute(instances - features) # log.debug('max: {}'.format(np.max(distances))) - mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance assert np.sum(mask) == 0 - fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile) + fh = MatrixFileHandler(pFileType='cool', pChrnameList=['1'], pMatrixFile=pMatrixFile) assert fh is not None # load data matrix2, _, _, _, _ = fh.load() instances, features = matrix2.nonzero() distances = np.absolute(instances - features) - mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance + mask = distances > 1 # 2.5 mb res --> all with 2.5 Mb distance assert np.sum(mask) > 0 # check if load and save matrix are equal @@ -297,6 +298,7 @@ def test_load_distance_cool(): os.unlink(cool_outfile) + def test_load_h5_save_cool(): cool_outfile = outfile + '.cool' diff --git a/requirements.txt b/requirements.txt index 414abff..53614ad 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,5 @@ numpy >= 1.16.* scipy >= 1.2.* pandas = 0.25.* pytables >= 3.5.* -future = 0.17.* -cooler = 0.8.5 +cooler >= 0.8.* intervaltree = 3.0.* diff --git a/setup.py b/setup.py index c2bdf6a..71347cd 100644 --- a/setup.py +++ b/setup.py @@ -96,16 +96,16 @@ def checkProgramIsInstalled(self, program, args, where_to_download, install_requires_py = ["numpy >= 1.16.*", "scipy >= 1.2.*", "tables >= 3.5.*", - "pandas >= 0.24.*", - "cooler == 0.8.5", + "pandas == 0.25.*", + "cooler >= 0.8.*", "intervaltree == 3.0.*" ] setup( name='HiCMatrix', version=get_version(), - author='Fidel Ramirez, Vivek Bhardwaj, Björn Grüning, Joachim Wolff', - author_email='deeptools@googlegroups.com', + author='Joachim Wolff, Leily Rabbani, Vivek Bhardwaj, Fidel Ramirez', + author_email='wolffj@informatik.uni-freiburg.de', packages=find_packages(), include_package_data=True, From f823f691af5f2938e6093112fb56385dfa235dba Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 14 May 2020 18:10:16 +0200 Subject: [PATCH 07/12] gzip support to read homer files. close #9 --- hicmatrix/lib/homer.py | 18 ++++++++-------- hicmatrix/test/test_matrixFileHandler.py | 26 +++++++++++++++++++++++- hicmatrix/utilities.py | 16 +++++++++++++++ 3 files changed, 51 insertions(+), 9 deletions(-) diff --git a/hicmatrix/lib/homer.py b/hicmatrix/lib/homer.py index 9b60a1e..aad6232 100644 --- a/hicmatrix/lib/homer.py +++ b/hicmatrix/lib/homer.py @@ -5,6 +5,7 @@ from scipy.sparse import csr_matrix from .matrixFile import MatrixFile +from hicmatrix.utilities import opener class Homer(MatrixFile, object): @@ -15,26 +16,27 @@ def __init__(self, pMatrixFile): def load(self): cut_intervals = [] - with open(self.matrixFileName, 'r') as matrix_file: + # matrix_file = opener(self.matrixFileName) + with opener(self.matrixFileName) as matrix_file: values = matrix_file.readline() - values = values.strip().split('\t') + values = values.strip().split(b'\t') # get bin size - start_first = int(values[2].strip().split('-')[1]) - start_second = int(values[3].strip().split('-')[1]) + start_first = int(values[2].strip().split(b'-')[1]) + start_second = int(values[3].strip().split(b'-')[1]) bin_size = start_second - start_first for i, value in enumerate(values[2:]): - chrom, start = value.strip().split('-') - cut_intervals.append((chrom, int(start), int(start) + bin_size, 1)) + chrom, start = value.strip().split(b'-') + cut_intervals.append((chrom.decode('ascii'), int(start), int(start) + bin_size, 1)) matrix_dense = [] for line in matrix_file: - values = line.split('\t') + values = line.split(b'\t') data = [] for i, value in enumerate(values[2:]): data.append(float(value)) matrix_dense.append(data) - + # matrix_file.close() matrix = csr_matrix(matrix_dense) nan_bins = None distance_counts = None diff --git a/hicmatrix/test/test_matrixFileHandler.py b/hicmatrix/test/test_matrixFileHandler.py index ece135a..a7686c0 100644 --- a/hicmatrix/test/test_matrixFileHandler.py +++ b/hicmatrix/test/test_matrixFileHandler.py @@ -11,7 +11,7 @@ outfile = '/tmp/matrix' -def test_load_homer(capsys): +def test_load_homer(): # create matrixFileHandler instance with filetype 'homer' pMatrixFile = ROOT + 'test_matrix.homer' fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile) @@ -35,6 +35,30 @@ def test_load_homer(capsys): assert correction_factors is None +def test_load_homer_gzip(): + # create matrixFileHandler instance with filetype 'homer' + pMatrixFile = ROOT + 'test_matrix.homer.gz' + fh = MatrixFileHandler(pFileType='homer', pMatrixFile=pMatrixFile) + assert fh is not None + + # load data + matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() + + # create test matrix + + test_matrix = np.array([[1.0, 0.1896, 0.2163, 0.08288, 0.1431, 0.2569, 0.1315, + 0.1488, -0.0312, 0.143, 0.06091, 0.03546, 0.1168]]) + + nt.assert_almost_equal(matrix[0].todense(), test_matrix) + + test_cut_intervals = [('3R', 1000000, 1020000, 1), ('3R', 1020000, 1040000, 1), ('3R', 1040000, 1060000, 1), ('3R', 1060000, 1080000, 1), ('3R', 1080000, 1100000, 1), ('3R', 1100000, 1120000, 1), ('3R', 1120000, 1140000, 1), ('3R', 1140000, 1160000, 1), ('3R', 1160000, 1180000, 1), ('3R', 1180000, 1200000, 1), ('3R', 1200000, 1220000, 1), ('3R', 1220000, 1240000, 1), ('3R', 1240000, 1260000, 1)] # noqa E501 + nt.assert_equal(cut_intervals, test_cut_intervals) + + assert nan_bins is None + assert distance_counts is None + assert correction_factors is None + + def test_save_homer(): homer_outfile = outfile + '.homer' diff --git a/hicmatrix/utilities.py b/hicmatrix/utilities.py index b4b3e44..c388a8b 100644 --- a/hicmatrix/utilities.py +++ b/hicmatrix/utilities.py @@ -1,5 +1,6 @@ import numpy as np import sys +import gzip def toString(s): @@ -98,3 +99,18 @@ def enlarge_bins(bin_intervals): bin_intervals[-1] = (chrom, start, end, extra) return bin_intervals + + +def opener(filename): + """ + Determines if a file is compressed or not + """ + f = open(filename, 'rb') + # print("gzip or not?", f.read(2)) + + if f.read(2) == b'\x1f\x8b': + f.seek(0) + return gzip.GzipFile(fileobj=f) + else: + f.seek(0) + return f From ab1d2113bb59a6e5c15b8811141416091cee5446 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 14 May 2020 18:55:26 +0200 Subject: [PATCH 08/12] Detect nan only weight column in another way --- hicmatrix/lib/cool.py | 9 +++++---- hicmatrix/utilities.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index ca29c4b..2c0b68e 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -15,7 +15,7 @@ import pandas as pd from hicmatrix.utilities import toString, toBytes -from hicmatrix.utilities import convertNansToOnes +from hicmatrix.utilities import convertNansToOnes, convertNansToZeros from hicmatrix._version import __version__ from .matrixFile import MatrixFile @@ -155,9 +155,10 @@ def load(self): matrix.data = matrix.data.astype(float) - correction_factors = convertNansToOnes(np.array(correction_factors_data_frame.values).flatten()) - # apply only if there are not only 1's - if np.sum(correction_factors) != len(correction_factors): + correction_factors = np.array(correction_factors_data_frame.values).flatten() + # Don't apply correction if weight were just 'nans' + if np.sum(np.isnan(correction_factors)) != len(correction_factors): + # correction_factors = convertNansToZeros(correction_factors) matrix.sort_indices() instances, features = matrix.nonzero() diff --git a/hicmatrix/utilities.py b/hicmatrix/utilities.py index c388a8b..a5d1ba3 100644 --- a/hicmatrix/utilities.py +++ b/hicmatrix/utilities.py @@ -54,7 +54,7 @@ def check_chrom_str_bytes(pIteratableObj, pObj): def convertNansToZeros(ma): nan_elements = np.flatnonzero(np.isnan(ma.data)) if len(nan_elements) > 0: - ma.data[nan_elements] = 0 + ma.data[nan_elements] = 0.0 return ma From ee61997e9d4058322f5114d136afa08132df8057 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 14 May 2020 18:56:26 +0200 Subject: [PATCH 09/12] Change curl to curl -L --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index d5fde17..1d4dfbe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -30,8 +30,8 @@ before_install: - export HIC_TEST_DATA_DIR="`pwd`/hicmatrix/test/test_data/" - echo $HIC_TEST_DATA_DIR -- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi -- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi +- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then curl -L https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -o miniconda.sh ; fi +- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then curl -L https://repo.continuum.io/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -o miniconda.sh ; fi - bash miniconda.sh -b -p $HOME/miniconda - PATH_WITHOUT_CONDA="$PATH" - export PATH="$HOME/miniconda/bin:$PATH" From 2c77f332430ac231702ffe52f129a4c20539eb73 Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Thu, 14 May 2020 19:29:55 +0200 Subject: [PATCH 10/12] Adding test data for homer gzip, adding nan_bins for nan correction factors --- hicmatrix/lib/cool.py | 5 +++++ hicmatrix/test/test_data/test_matrix.homer.gz | Bin 0 -> 647 bytes hicmatrix/test/test_matrixFileHandler.py | 4 ++++ 3 files changed, 9 insertions(+) create mode 100644 hicmatrix/test/test_data/test_matrix.homer.gz diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 2c0b68e..fa1eedf 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -193,6 +193,11 @@ def load(self): del correction_factors_data_frame # try to restore nan_bins. try: + # remove possible nan bins introduced by the correction factors + # to have them part of the nan_bins vector + mask = matrix.data == np.nan + matrix.data[mask] = 0 + matrix.eliminate_zeros() shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1] nan_bins = np.arange(shape) nan_bins = np.setdiff1d(nan_bins, matrix.indices) diff --git a/hicmatrix/test/test_data/test_matrix.homer.gz b/hicmatrix/test/test_data/test_matrix.homer.gz new file mode 100644 index 0000000000000000000000000000000000000000..351a6bdecb5f601a6f616627864f3d07061ffa8b GIT binary patch literal 647 zcmV;20(ku&iwFpfjd)%F19W9`bYE>@baH8UE@*FUWpV&rRlAN9F%XP*zJd^h*0|g6 zfCvH-k`q60g4Jn~J84-7_cz*f1Uq5{~JzZY*x7XjN@N_0W{w8F=2$%uuf`l?c8KI0&Mkr&HG0J$9ZAUs2 zlI#C39|x)}U(TLR>?ntUs^eNa;=UUL$!^MRAaa3{pg6vx(>b_KW0m-^n4NDl)`Ckh z%P!?nvK_@?A*OUdF}U33fgNStjY(09UMW(MwM0srYLvCYx|_?aJ>Ng%LF}CSFOC~* zN3SM?Jm6zFR;qGh4M9-M%1kxswSw;A>36y5ccjd+3qDO|T{BM5%tU8G*adyS$8zMT zR_7S?L|>$WV$&OBWyVEOs;ruMv6?$-kPL|wi1ffN$OAf{$SYCo8Y`m2Dqc;p7KGTn z_K0E^J}q9htGrl=EF(NH3*smnE26K;Kl&=jvj!q%MNPuktz?>l6_-i`G1KFker+ve z8QFnX5C?P~Mbrzl3dglJhyk6Ak27M?WzQgHCQei*mlE1qh_a0iyi#>Q$EsM0He+G+ zBF3Z<3u%Tw_qZ6v%>0|s3?AF+XswE-tvW>1Ey!c}U@~%Rk|&BRXQWgHs;ysM4d+(O5{bW(Q_jb(D=Yx^fmPLm{0Ofq6{}`!-C7sf;;@xr08pjn(Lw9hhas zQ92JZXU$}v3B1fhC2+kFk5k! Date: Fri, 15 May 2020 10:42:30 +0200 Subject: [PATCH 11/12] Fixing #25, #23. Thanks @lldelisle --- hicmatrix/lib/cool.py | 18 ++++++++++++------ hicmatrix/test/test_HiCMatrix.py | 3 +-- hicmatrix/test/test_matrixFileHandler.py | 12 ++++++------ 3 files changed, 19 insertions(+), 14 deletions(-) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index fa1eedf..6673e3b 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -157,7 +157,7 @@ def load(self): correction_factors = np.array(correction_factors_data_frame.values).flatten() # Don't apply correction if weight were just 'nans' - if np.sum(np.isnan(correction_factors)) != len(correction_factors): + if np.sum(np.isnan(matrix.data)) != len(correction_factors): # correction_factors = convertNansToZeros(correction_factors) matrix.sort_indices() @@ -193,14 +193,20 @@ def load(self): del correction_factors_data_frame # try to restore nan_bins. try: - # remove possible nan bins introduced by the correction factors + # remove possible nan bins introduced by the correction factors # to have them part of the nan_bins vector - mask = matrix.data == np.nan - matrix.data[mask] = 0 + mask = np.isnan(matrix.data) + matrix.data[mask] = 0 matrix.eliminate_zeros() shape = matrix.shape[0] if matrix.shape[0] < matrix.shape[1] else matrix.shape[1] - nan_bins = np.arange(shape) - nan_bins = np.setdiff1d(nan_bins, matrix.indices) + nan_bins_indices = np.arange(shape) + nan_bins_indices = np.setdiff1d(nan_bins_indices, matrix.indices) + + nan_bins = [] + for bin_id in nan_bins_indices: + if len(matrix[bin_id, :].data) == 0: + nan_bins.append(bin_id) + nan_bins = np.array(nan_bins) except Exception: nan_bins = None diff --git a/hicmatrix/test/test_HiCMatrix.py b/hicmatrix/test/test_HiCMatrix.py index 75c9a64..181c9f0 100644 --- a/hicmatrix/test/test_HiCMatrix.py +++ b/hicmatrix/test/test_HiCMatrix.py @@ -1146,8 +1146,7 @@ def test_create_from_cool(): hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_4chr.cool') nt.assert_equal(sorted(hic_ma.matrix.indices), [0, 3]) nt.assert_equal(sorted(hic_ma.matrix.data), [1, 1]) - # This is failing: - # nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6]) + nt.assert_equal(sorted(hic_ma.nan_bins)[:5], [1, 2, 4, 5, 6]) hic_ma = hm.hiCMatrix(ROOT + 'one_interaction_diag_4chr.cool') nt.assert_equal(sorted(hic_ma.matrix.indices), [0]) nt.assert_equal(sorted(hic_ma.matrix.data), [1]) diff --git a/hicmatrix/test/test_matrixFileHandler.py b/hicmatrix/test/test_matrixFileHandler.py index 7298d18..695d1de 100644 --- a/hicmatrix/test/test_matrixFileHandler.py +++ b/hicmatrix/test/test_matrixFileHandler.py @@ -242,7 +242,7 @@ def test_load_cool2(capsys): for ind, element in enumerate(tup): assert element == test_cut_intervals[index][ind] - test_nan_bins = [0, 1, 2, 4] + test_nan_bins = [1, 2, 4, 5] nt.assert_almost_equal(nan_bins[:4], test_nan_bins) assert distance_counts is None @@ -392,9 +392,9 @@ def test_save_cool_enforce_integer(): assert fh_test is not None matrix_test, cut_intervals_test, nan_bins_test, distance_counts_test, correction_factors_test = fh_test.load() - pMatrixFile = ROOT + 'Li_et_al_2015.h5' - fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile) - assert fh is not None + # pMatrixFile = ROOT + 'Li_et_al_2015.h5' + # fh = MatrixFileHandler(pFileType='h5', pMatrixFile=pMatrixFile) + # assert fh is not None # load data # matrix, cut_intervals, nan_bins, distance_counts, correction_factors = fh.load() @@ -405,9 +405,9 @@ def test_save_cool_enforce_integer(): # matrix_applied_correction = matrix.data / instances_factors # mask = matrix.data == 0 + matrix.data = np.rint(matrix.data) matrix.eliminate_zeros() - matrix_test.eliminate_zeros() - + # matrix_test.eliminate_zeros() nt.assert_almost_equal(matrix.data, matrix_test.data, decimal=0) nt.assert_equal(len(cut_intervals), len(cut_intervals_test)) From 79072f67957345108c325edf0d792fe1af453bca Mon Sep 17 00:00:00 2001 From: Joachim Wolff Date: Fri, 15 May 2020 10:51:13 +0200 Subject: [PATCH 12/12] Bug fix --- hicmatrix/lib/cool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hicmatrix/lib/cool.py b/hicmatrix/lib/cool.py index 6673e3b..af18f86 100644 --- a/hicmatrix/lib/cool.py +++ b/hicmatrix/lib/cool.py @@ -157,7 +157,7 @@ def load(self): correction_factors = np.array(correction_factors_data_frame.values).flatten() # Don't apply correction if weight were just 'nans' - if np.sum(np.isnan(matrix.data)) != len(correction_factors): + if np.sum(np.isnan(correction_factors)) != len(correction_factors): # correction_factors = convertNansToZeros(correction_factors) matrix.sort_indices()