-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtutorialextras.py
160 lines (126 loc) · 8.09 KB
/
tutorialextras.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
import warnings
from datetime import datetime

import numpy
import pandas
from nPYc.enumerations import VariableType, DatasetLevel, AssayRole, SampleType
def _parseEnumColumn(values, enumeration):
    """
    Map free-text labels onto the members of *enumeration*.

    Labels are compared with spaces removed and lower-cased, against ``str(member)`` normalised the same way.

    :param values: Iterable of raw cell values read from a CSV column
    :param enumeration: Enum class whose members are the candidate matches
    :return: One entry per input value: the matching member, the normalised string when no member matches, or ``numpy.nan`` when the value is not a string
    :rtype: list
    """
    lookup = {}
    for member in enumeration:
        # setdefault: if two members normalise to the same key, the first one wins
        lookup.setdefault(str(member).replace(" ", "").lower(), member)

    parsed = []
    for value in values:
        if isinstance(value, str):
            key = value.replace(" ", "").lower()
            parsed.append(lookup.get(key, key))
        else:
            parsed.append(numpy.nan)
    return parsed


def matchBasicCSV(dataset, filePath):
    """
    Do a basic join of the data in the csv file at filePath to the :py:attr:`sampleMetadata` dataframe on the 'Sample File Name'.

    The join is a left join on :py:attr:`sampleMetadata`, so the number and order of rows is unchanged. Columns present in both tables are taken from the CSV. Optional CSV columns receive special handling when present:

    * 'AssayRole' / 'SampleType': text is converted to the matching enum member.
    * 'Acquired Time': parsed with the format ``%d/%m/%Y %H:%M`` and stored as Python ``datetime`` objects; missing entries are kept as ``NaT``.
    * 'Include Sample': rows where it equals ``False`` are masked in :py:attr:`sampleMask` and the column is removed.

    Rows of the CSV that match no acquired sample are stored in ``dataset.sampleAbsentMetadata``. A 'Metadata Available' column flags which acquired samples were found in the CSV.

    :param dataset: Object to update in place; it must expose ``sampleMetadata`` (a dataframe with 'Sample File Name', 'AssayRole', 'SampleType' and 'Dilution' columns), ``sampleMask`` and ``Attributes['Log']`` -- presumably an nPYc Dataset, confirm against callers
    :param filePath: Path of the CSV file to read
    :return: *dataset*, or ``None`` (after a warning, leaving *dataset* untouched) when the CSV contains duplicated 'Sample File Name' entries
    :raises KeyError: if the CSV has no 'Sample File Name' column
    """
    # Dates are parsed explicitly further down, so a CSV without an 'Acquired Time' column can still be read
    csvData = pandas.read_csv(filePath, dtype={'Sample File Name': str, 'Sample ID': str})

    if 'Sample File Name' not in csvData.columns:
        raise KeyError("No 'Sample File Name' column present, unable to join tables.")

    # Duplicated keys would add rows in the left join and break the row-wise correspondence with sampleMask
    if csvData['Sample File Name'].duplicated().any():
        warnings.warn('Check and remove duplicates in CSV file')
        return None

    currentMetadata = dataset.sampleMetadata.copy()

    # Keep the previous values (e.g. parsed from file names) so they can be restored for samples the CSV omits
    oldAssayRole = currentMetadata['AssayRole']
    oldSampleType = currentMetadata['SampleType']
    oldDilution = currentMetadata['Dilution']

    # If columns exist in both the CSV data and sampleMetadata, remove them from sampleMetadata
    sharedColumns = [column for column in csvData.columns
                     if column != 'Sample File Name' and column in currentMetadata.columns]
    currentMetadata = currentMetadata.drop(columns=sharedColumns)

    # If AssayRole or SampleType columns are present, parse strings into enums
    if 'AssayRole' in csvData.columns:
        csvData['AssayRole'] = pandas.Series(_parseEnumColumn(csvData['AssayRole'], AssayRole),
                                             index=csvData.index, dtype=object)
    if 'SampleType' in csvData.columns:
        csvData['SampleType'] = pandas.Series(_parseEnumColumn(csvData['SampleType'], SampleType),
                                              index=csvData.index, dtype=object)

    # If an Acquired Time column is in the CSV file, store plain datetime objects to allow operations on
    # timestamps and timedeltas, which are used in some plotting functions
    if 'Acquired Time' in csvData.columns:
        acquired = pandas.to_datetime(csvData['Acquired Time'], format='%d/%m/%Y %H:%M')
        csvData['Acquired Time'] = pandas.Series(
            [stamp.to_pydatetime() if pandas.notnull(stamp) else stamp for stamp in acquired],
            index=csvData.index, dtype=object)

    # Carry the current mask through the merge in a temporary 'Masked' column
    currentMetadata['Masked'] = False
    currentMetadata.loc[numpy.logical_not(dataset.sampleMask), 'Masked'] = True

    # Left join, without sort, so the intensityData matrix and the sample masks are kept in order
    joinedTable = pandas.merge(currentMetadata, csvData, how='left', on='Sample File Name', sort=False)

    # Rows of the joined table that found a partner in the CSV
    inCsv = joinedTable['Sample File Name'].isin(csvData['Sample File Name'])
    merged_indices = joinedTable[inCsv].index

    # Samples in the CSV file but not acquired go to sampleAbsentMetadata
    notAcquired = ~csvData['Sample File Name'].isin(currentMetadata['Sample File Name'])
    if notAcquired.any():
        sampleAbsentMetadata = csvData.loc[notAcquired, :].copy(deep=True)

        # NOTE(review): both columns are overwritten with a default before the string matches below run, so
        # the matches only ever see that default -- confirm whether the CSV-provided values should be kept.
        sampleAbsentMetadata['SampleType'] = SampleType.StudySample
        for label, member in (('StudyPool', SampleType.StudyPool),
                              ('ExternalReference', SampleType.ExternalReference)):
            matches = sampleAbsentMetadata['SampleType'].str.match(label, na=False).astype(bool)
            sampleAbsentMetadata.loc[matches, 'SampleType'] = member

        sampleAbsentMetadata['AssayRole'] = AssayRole.Assay
        for label, member in (('PrecisionReference', AssayRole.PrecisionReference),
                              ('LinearityReference', AssayRole.LinearityReference)):
            matches = sampleAbsentMetadata['AssayRole'].str.match(label, na=False).astype(bool)
            sampleAbsentMetadata.loc[matches, 'AssayRole'] = member

        # Remove duplicate columns (these would be suffixed with _x or _y)
        cols = [c for c in sampleAbsentMetadata.columns if c[-2:] != '_y']
        sampleAbsentMetadata = sampleAbsentMetadata[cols]
        sampleAbsentMetadata = sampleAbsentMetadata.rename(columns=lambda x: x.replace('_x', ''))

        dataset.sampleAbsentMetadata = sampleAbsentMetadata

    # By default everything in the CSV has metadata available
    joinedTable.loc[merged_indices, 'Metadata Available'] = True

    # Samples acquired but not mentioned in the CSV
    absentFromCsv = ~currentMetadata['Sample File Name'].isin(csvData['Sample File Name'])
    if absentFromCsv.any():
        # Split them by whether AssayRole and SampleType were already known before this import
        hadMetadata = oldSampleType.notnull() & oldAssayRole.notnull()
        previousMetadataAvailable = currentMetadata[absentFromCsv & hadMetadata].index
        metadataNotAvailable = currentMetadata[absentFromCsv & ~hadMetadata].index

        # Keep the old values for samples not mentioned in the CSV for which they were previously available
        if len(previousMetadataAvailable) > 0:
            joinedTable.loc[previousMetadataAvailable, 'AssayRole'] = oldAssayRole[previousMetadataAvailable]
            joinedTable.loc[previousMetadataAvailable, 'SampleType'] = oldSampleType[previousMetadataAvailable]
            joinedTable.loc[previousMetadataAvailable, 'Dilution'] = oldDilution[previousMetadataAvailable]

        # Flagged only; these samples are not masked
        if len(metadataNotAvailable) > 0:
            joinedTable.loc[metadataNotAvailable, 'Metadata Available'] = False

    # Acquired samples where 'Include Sample' equals False are masked, not removed
    if 'Include Sample' in csvData.columns:
        # '== False' rather than '~': the column holds NaN for samples missing from the CSV
        which_to_drop = joinedTable[joinedTable['Include Sample'] == False].index
        dataset.sampleMask[which_to_drop] = False
        joinedTable = joinedTable.drop(columns='Include Sample')

    # Re-apply the mask that was in place before the merge
    previously_masked = joinedTable[joinedTable['Masked'].astype(bool)].index
    dataset.sampleMask[previously_masked] = False
    joinedTable = joinedTable.drop(columns='Masked')

    # Regenerate the dataframe index for the joined table
    joinedTable = joinedTable.reset_index(drop=True)
    dataset.sampleMetadata = joinedTable

    # Assumes the sample base name is the same as the file name
    dataset.sampleMetadata['Sample Base Name'] = dataset.sampleMetadata['Sample File Name']

    # Ensure there is a batch column
    if 'Batch' not in dataset.sampleMetadata:
        dataset.sampleMetadata['Batch'] = 1

    dataset.Attributes['Log'].append([datetime.now(), 'Basic CSV matched from %s' % (filePath)])

    return dataset