From 2cd22808f4df11a50ba0500335b2b60b78fd4be1 Mon Sep 17 00:00:00 2001 From: bus33060 Date: Thu, 23 May 2024 12:53:50 +0200 Subject: [PATCH] implemented augmentation step and added h5 file structure viewer --- POTATO_ForceRamp.py | 91 +++++++++++++++++++++-------------------- POTATO_GUI.py | 75 ++++++++++++++++++++++++++++----- POTATO_find_steps.py | 26 ++++++++++++ POTATO_preprocessing.py | 60 +++++++++++++++++++++------ 4 files changed, 185 insertions(+), 67 deletions(-) diff --git a/POTATO_ForceRamp.py b/POTATO_ForceRamp.py index c0ddd89..962e0fd 100644 --- a/POTATO_ForceRamp.py +++ b/POTATO_ForceRamp.py @@ -4,6 +4,7 @@ import h5py import numpy as np from pathlib import Path +import lumicks.pylake as lk # relative imports from POTATO_fitting import fitting_ds, fitting_ss, plot_fit @@ -15,6 +16,12 @@ """define the functions of the subprocess processing the data""" +def show_h5_structure(file_path): + file_h5 = lk.File(file_path) + + return file_h5 + + def read_in_data(file_num, Files, input_settings, input_format): if input_format['CSV'] == 1: df = pd.read_csv(Files[file_num]) @@ -27,12 +34,8 @@ def read_in_data(file_num, Files, input_settings, input_format): else: Distance = df.to_numpy()[:, 1] / 1000 # accessing the data frequency from user input - Frequency_value = input_settings['data_frequency'] - if input_format['preprocess'] == 1: - Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings) - else: - Force_Distance = np.column_stack((Force, Distance * 1000)) - Force_Distance_um = np.column_stack((Force, Distance)) + Frequency_value = input_settings['data_frequency'] + Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings, input_format) else: with h5py.File(Files[file_num], "r") as f: @@ -48,11 +51,7 @@ def read_in_data(file_num, Files, input_settings, input_format): Distance = f.get("Distance/Piezo Distance") # accessing the data frequency from the h5 file Frequency_value = Force.attrs['Sample rate (Hz)'] - if input_format['preprocess'] == 1: - Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings) - else: - Force_Distance = np.column_stack((Force, Distance * 1000)) - Force_Distance_um = np.column_stack((Force, Distance)) + Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings, input_format) elif input_format['LF'] == 1: if input_format['Trap'] == 1: @@ -71,11 +70,9 @@ def read_in_data(file_num, Files, input_settings, input_format): except: load_distance = f.get("Distance/Distance 1")[:] Distance = load_distance['Value'][:] - if input_format['preprocess'] == 1: - Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings) - else: - Force_Distance = np.column_stack((Force, Distance * 1000)) - Force_Distance_um = np.column_stack((Force, Distance)) + + Force_Distance, Force_Distance_um = preprocess_RAW(Force, Distance, input_settings, input_format) + # calculating the data frequency based on start- and end-time of the measurement size_F_LF = len(Force) stop_time_F_LF = load_force.attrs['Stop time (ns)'] @@ -186,9 +183,11 @@ def start_subprocess(analysis_folder, timestamp, Files, input_settings, input_fo filename_i = filename + '_' + suffix Force_Distance = curves[x][:, :2] + print('################ FD', len(Force_Distance)) Force_Distance_um = np.copy(Force_Distance) Force_Distance_um[:, 1] = Force_Distance_um[:, 1] / 1000 - ###### Detect MultiFiles ###### + ###### Detect MultiFiles end ###### + orientation = "forward" if Force_Distance[0, 1] > 
Force_Distance[-1, 1]: # reverse orientation = "reverse" @@ -204,39 +203,39 @@ def start_subprocess(analysis_folder, timestamp, Files, input_settings, input_fo # trim data below specified force thresholds F_trimmed, PD_trimmed, F_low = trim_data(Force_Distance, input_settings['F_min']) - + print('#################### Trimmmed', len(F_trimmed)) if not F_trimmed.size == 0: # create force and distance derivative of the pre-processed data to be able to identify steps derivative_array = create_derivative(input_settings, Frequency_value, F_trimmed, PD_trimmed, F_low) - + print('################### der array', len(derivative_array)) """find steps based on force derivative""" filename_results = analysis_folder + "/" + filename_i + "_results_" + timestamp + ".csv" - try: - results_F, PD_start_F = find_steps_F( - input_settings, - filename_i, - Force_Distance, - derivative_array, - orientation - ) - - results_F_list = list(results_F) + # try: + results_F, PD_start_F = find_steps_F( + input_settings, + filename_i, + Force_Distance, + derivative_array, + orientation + ) - if export_data['export_STEPS'] == 1: - steps_results_F = pd.DataFrame(results_F_list) - with open(filename_results, 'a+') as f: - f.write('\nSteps found by force derivative:\n') - steps_results_F.to_csv(filename_results, mode='a', index=False, header=True) - else: - pass + results_F_list = list(results_F) - except: - results_F = [] - PD_start_F = [] - print("Error in finding steps for file " + str(filename_i) + '\n' 'There was an error in finding Force steps') + if export_data['export_STEPS'] == 1: + steps_results_F = pd.DataFrame(results_F_list) + with open(filename_results, 'a+') as f: + f.write('\nSteps found by force derivative:\n') + steps_results_F.to_csv(filename_results, mode='a', index=False, header=True) + else: pass + # except: + # results_F = [] + # PD_start_F = [] + # print("Error in finding steps for file " + str(filename_i) + '\n' 'There was an error in finding Force steps') + # pass + """find steps based on distance derivative""" try: @@ -302,11 +301,11 @@ def start_subprocess(analysis_folder, timestamp, Files, input_settings, input_fo common_steps_results.to_csv(filename_results, mode='a', index=False, header=True) # put common steps into a total_results dataframe so all steps from all files of the analysed folder can be exported together - total_results_steps = total_results_steps.append(common_steps_results, ignore_index=True, sort=False) + total_results_steps = pd.concat([total_results_steps, common_steps_results], ignore_index=True, sort=False) else: - common_steps_results = [{'filename': filename_i, 'orientation': orientation, 'Derivative of': '', 'step #': 0, 'F1': '', 'F2': '', 'Fc': '', 'step start': '', 'step end': '', 'step length': ''}] - total_results_steps = total_results_steps.append(common_steps_results, ignore_index=True, sort=False) + common_steps_results = pd.DataFrame({'filename': filename_i, 'orientation': orientation, 'Derivative of': '', 'step #': 0, 'F1': '', 'F2': '', 'Fc': '', 'step start': '', 'step end': '', 'step length': ''}, index=[0]) + total_results_steps = pd.concat([total_results_steps, common_steps_results], ignore_index=True, sort=False) '''if common steps were found, try to fit FD-Curve''' empty = { @@ -364,7 +363,8 @@ def start_subprocess(analysis_folder, timestamp, Files, input_settings, input_fo 1, derivative_array, F_low, - 0 + 0, + n ) fit.append(fit_ss) @@ -392,7 +392,8 @@ def start_subprocess(analysis_folder, timestamp, Files, input_settings, input_fo 1, derivative_array, 
F_low, - 0 + 0, + len(common_steps) ) fit.append(fit_ss) diff --git a/POTATO_GUI.py b/POTATO_GUI.py index aefa097..a4f41df 100644 --- a/POTATO_GUI.py +++ b/POTATO_GUI.py @@ -15,7 +15,6 @@ from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg, NavigationToolbar2Tk from matplotlib.figure import Figure from matplotlib.lines import Line2D -from tkinter import filedialog from tkinter import ttk from PIL import ImageTk, Image import pandas as pd @@ -27,8 +26,8 @@ import json # relative imports -from POTATO_ForceRamp import start_subprocess, read_in_data -from POTATO_preprocessing import preprocess_RAW, create_derivative +from POTATO_ForceRamp import start_subprocess, read_in_data, show_h5_structure +from POTATO_preprocessing import create_derivative from POTATO_config import default_values_HF, default_values_LF, default_values_CSV, default_values_FIT, default_values_constantF from POTATO_constantF import get_constantF, display_constantF, fit_constantF from POTATO_fitting import fitting_ds, fitting_ss @@ -53,7 +52,7 @@ def start_analysis(): input_settings, input_format, export_data, input_fitting, input_constantF = check_settings() # ask wich directory should be analysed - folder = filedialog.askdirectory() + folder = tk.filedialog.askdirectory() root.title('POTATO -- ' + str(folder)) # decide which input format was choosen @@ -113,6 +112,7 @@ def parameters(default_values, default_fit, default_constantF): Force_Min.set(default_values['Force threshold, pN']) Z_score_force.set(default_values['Z-score force']) Z_score_distance.set(default_values['Z-score distance']) + augment_factor_value.set(2) step_d_variable.set(str(default_values['Step d'])) window_size_variable.set(str(default_values['Moving median window size'])) @@ -169,13 +169,15 @@ def check_settings(): 'z-score_d': float(Z_score_distance2.get()), 'window_size': int(window_size_value.get()), 'data_frequency': float(Frequency_value.get()), - 'STD_diff': float(STD_difference_value.get()) + 'STD_diff': float(STD_difference_value.get()), + 'augment_factor': augment_factor_value.get() } input_format = { 'HF': check_box_HF.get(), 'LF': check_box_LF.get(), 'CSV': check_box_CSV.get(), + 'Augment': check_box_augment.get(), 'Trap': check_box_Trap1.get(), 'length_measure': check_box_um.get(), 'MultiH5': check_box_multiH5.get(), @@ -277,7 +279,7 @@ def get_single_file(format): pass input_settings, input_format, export_data, input_fitting, input_constantF = check_settings() - import_file_path = filedialog.askopenfilename() + import_file_path = tk.filedialog.askopenfilename() input_format['preprocess'] = 0 FD_raw, FD_raw_um, Frequency_value, filename = read_in_data(0, [import_file_path], input_settings, input_format) input_format['preprocess'] = 1 @@ -285,6 +287,20 @@ def get_single_file(format): display_RAW_FD(FD[:, 0], FD[:, 1], FD_raw[:, 0], FD_raw[:, 1], filename) +def show_h5(): + import_file_path = tk.filedialog.askopenfilename() + h5_structure = show_h5_structure(import_file_path) + h5_structure_window = tk.Toplevel(root) + h5_structure_window.title("H5 structure") + + text = tk.Text(h5_structure_window, height=50, width=200) + scroll_bar = tk.Scrollbar(h5_structure_window, command=text.yview) + scroll_bar.pack(side=tk.RIGHT, fill=tk.Y) + text['yscrollcommand'] = scroll_bar.set + text.pack(side=tk.LEFT, fill=tk.Y) + text.insert("end", h5_structure) + + # create the plot for tab2 def display_RAW_FD(processed_F, processed_D, raw_F, raw_D, filename): global figure_raw @@ -305,7 +321,7 @@ def display_RAW_FD(processed_F, processed_D, raw_F, 
raw_D, filename): subplot1.set_title(str(filename)) subplot1.set_xlabel("Distance (nm)") subplot1.set_ylabel("Force (pN)") - subplot1.plot(raw_D, raw_F, alpha=0.8, color='C0', zorder=0) + subplot1.scatter(raw_D, raw_F, alpha=0.8, color='C0', s=0.1, zorder=0) subplot1.scatter(processed_D, processed_F, marker='.', s=0.1, linewidths=None, alpha=1, color='C1', zorder=1) subplot1.legend(legend_elements, ['Downsampled FD-Data', 'Filtered FD-Data']) @@ -360,7 +376,7 @@ def open_folder(): input_settings, input_format, export_data, input_fitting, input_constantF = check_settings() # ask wich directory should be analysed - folder = filedialog.askdirectory() + folder = tk.filedialog.askdirectory() root.title('POTATO -- ' + str(folder)) # decide which input format was choosen @@ -774,7 +790,7 @@ def export_table(): 'Work [kT]' ]) - name = filedialog.asksaveasfile(mode='w', defaultextension=".csv") + name = tk.filedialog.asksaveasfile(mode='w', defaultextension=".csv") Fit_results.to_csv(name.name, index=False, header=True) @@ -818,6 +834,7 @@ def tab_bind(event=None): file_menu.add_command(label='Analyse folder (FD curves)', command=start_analysis) file_menu.add_command(label='Display single FD curve (h5)', command=lambda: get_single_file('h5')) file_menu.add_command(label='Display single FD curve (csv)', command=lambda: get_single_file('csv')) + file_menu.add_command(label='Show h5 file structure', command=lambda: show_h5()) file_menu.add_separator() file_menu.add_command(label='Display constant force', command=show_constantF) file_menu.add_command(label='Fit constant force', command=start_constantF) @@ -896,6 +913,7 @@ def select_box(*check_box): check_box_HF = tk.IntVar(value=1) check_box_LF = tk.IntVar() check_box_CSV = tk.IntVar() + check_box_augment = tk.IntVar() check_box_Trap1 = tk.IntVar() check_box_Trap2 = tk.IntVar(value=1) check_box_um = tk.IntVar(value=1) @@ -924,6 +942,13 @@ def select_box(*check_box): command=lambda: [select_box(check_box_CSV, check_box_HF, check_box_LF), parameters(default_values_CSV, default_values_FIT, default_values_constantF)] ).grid(row=2, column=0, sticky='W') + check_augment = tk.Checkbutton( + check_box, + text="Data Augmentation", + variable=check_box_augment, + command=lambda: show_augment() + ).grid(row=3, column=0, sticky='W') + check_Trap1 = tk.Checkbutton( check_box, text="Trap 1x", @@ -978,6 +1003,36 @@ def select_box(*check_box): Label_Zscore_F = tk.Label(parameter_frame, text='Z-score force') Label_Zscore_D = tk.Label(parameter_frame, text='Z-score distance') + + Cluster_augment = tk.Label(parameter_frame, text='AUGMENTATION', font='Helvetica 9 bold') + Label_augment_factor = tk.Label(parameter_frame, text='Augmentation factor') + augment_factor_value = tk.StringVar() + augment_factor_entry = tk.Entry(parameter_frame, textvariable=augment_factor_value) + + + def show_augment(): + global Cluster_augment + global Label_augment_factor + global augment_factor_value + global augment_factor_entry + + if check_box_augment.get() == 1: + augment_factor_value.set(2) + Cluster_augment.grid(row=8, column=0, padx=2, pady=(20, 2)) + Label_augment_factor.grid(row=9, column=0, sticky=tk.E + tk.W, padx=2, pady=2) + augment_factor_entry.grid(row=9, column=1, padx=2, pady=2) + + elif check_box_augment.get() == 0 and Cluster_augment and Label_augment_factor and augment_factor_entry: + Cluster_augment.destroy() + Label_augment_factor.destroy() + augment_factor_entry.destroy() + Cluster_augment = tk.Label(parameter_frame, text='AUGMENTATION', font='Helvetica 9 bold') + 
Label_augment_factor = tk.Label(parameter_frame, text='Augmentation factor') + augment_factor_value = tk.StringVar() + augment_factor_entry = tk.Entry(parameter_frame, textvariable=augment_factor_value) + + + downsample_value = tk.StringVar() downsample_value1 = tk.Entry(parameter_frame, textvariable=downsample_value) @@ -1027,7 +1082,7 @@ def select_box(*check_box): width=20 ) - BUTTON1.grid(row=9, column=0, columnspan=2, pady=125) + BUTTON1.grid(row=11, column=0, columnspan=2, pady=125) """organize tab2""" figure_frame2 = tk.Canvas(tab2, height=650, width=650, borderwidth=1, relief='ridge') diff --git a/POTATO_find_steps.py b/POTATO_find_steps.py index 5255827..4219e2e 100644 --- a/POTATO_find_steps.py +++ b/POTATO_find_steps.py @@ -40,6 +40,30 @@ def moving_median(input_data, column_number, window_size): # sorting the data based on a x times STD threshold (normal distibuted noise vs extreme values from steps) +# try to simplify this function +# def cut_off(input_array, column_number, mm, std, z_score): +# # convert mm to a numpy array +# mm = np.array(mm) + +# # calculate the upper and lower bounds +# upper_bound = mm + z_score * std +# lower_bound = mm - z_score * std + +# # create masks for the above, below, and inside regions +# above_mask = input_array[:, column_number] > upper_bound +# below_mask = input_array[:, column_number] < lower_bound +# inside_mask = ~(above_mask | below_mask) + +# # use the masks to select the corresponding rows from input_array +# Above = input_array[above_mask] +# Below = input_array[below_mask] +# Inside = input_array[inside_mask] + +# inside_indices = np.where(inside_mask)[0] + +# return Above, Inside, Below, inside_indices + + def cut_off(input_array, column_number, mm, std, z_score): # sort values - inside STD region, above STD region and below STD region F_values_inside = [] @@ -201,6 +225,7 @@ def find_steps_PD(input_settings, filename_i, Force_Distance, der_arr, orientati Above, Inside, Below, inside_indices_PD = cut_off(der_arr, 3, PD_mm, STD_1, input_settings['z-score_d']) n_runs = n_runs + 1 + if STD_1 < 0.05: STD_1 = 0.05 @@ -222,6 +247,7 @@ def find_steps_PD(input_settings, filename_i, Force_Distance, der_arr, orientati for i in range(len(PD_mm)): PD_mm2_STD2_positive.append(PD_mm[i] + input_settings['z-score_d'] * STD_1) PD_mm2_STD2_negative.append(PD_mm[i] - input_settings['z-score_d'] * STD_1) + # find the step points # for those steps that cross the 3*STD2 threshold -> find the closest 0 values prior/following to the crossing one diff --git a/POTATO_preprocessing.py b/POTATO_preprocessing.py index 1c1d07c..809b54d 100644 --- a/POTATO_preprocessing.py +++ b/POTATO_preprocessing.py @@ -4,18 +4,54 @@ import numpy as np -def preprocess_RAW(Force, Distance, input_settings): - # Downsample - Force_ds = Force[::input_settings['downsample_value']] - Distance_ds = Distance[::input_settings['downsample_value']] - - # Filter - b, a = signal.butter(input_settings['filter_degree'], input_settings['filter_cut_off']) - filteredForce = signal.filtfilt(b, a, Force_ds) - filteredDistance = signal.filtfilt(b, a, Distance_ds) - - Force_Distance = np.column_stack((filteredForce, filteredDistance * 1000)) - Force_Distance_um = np.column_stack((filteredForce, filteredDistance)) +def preprocess_RAW(Force, Distance, input_settings, input_format): + + if input_format['Augment'] == 1 and input_settings['augment_factor'] != '': + # Augment data + new_f = [] + new_d = [] + factor = int(input_settings['augment_factor']) + + for line in range(len(Force) - 1): + f = 
Force[line] + d = Distance[line] + new_f.append(f) + new_d.append(d) + f_next = Force[line + 1] + d_next = Distance[line + 1] + delta_f = abs(f_next - f) + delta_d = abs(d_next - d) + + for point in range(1, factor): + mu = 0 + sigma_f = (1.5 * delta_f) / 3 + sigma_d = (1.5 * delta_d) / 3 + nf = np.random.normal(mu, sigma_f) + nd = np.random.normal(mu, sigma_d) + new_point_f = f + ((point + nf) / factor) * delta_f + new_point_d = d + ((point + nd) / factor) * delta_d + new_f.append(new_point_f) + new_d.append(new_point_d) + + Force = np.array(new_f) + Distance = np.array(new_d) + + if input_format['preprocess'] == 1: + # Downsample + Force_ds = Force[::input_settings['downsample_value']] + Distance_ds = Distance[::input_settings['downsample_value']] + + # Filter + b, a = signal.butter(input_settings['filter_degree'], input_settings['filter_cut_off']) + filteredForce = signal.filtfilt(b, a, Force_ds) + filteredDistance = signal.filtfilt(b, a, Distance_ds) + + Force_Distance = np.column_stack((filteredForce, filteredDistance * 1000)) + Force_Distance_um = np.column_stack((filteredForce, filteredDistance)) + + else: + Force_Distance = np.column_stack((Force, Distance * 1000)) + Force_Distance_um = np.column_stack((Force, Distance)) return Force_Distance, Force_Distance_um
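
Note on the new "Show h5 file structure" menu entry: show_h5_structure simply wraps the selected file with lumicks.pylake and the GUI inserts the returned object into a Text widget, so the displayed tree is whatever pylake renders for that file. A minimal check of the same idea outside the GUI, assuming pylake is installed; the file path is illustrative only:

    import lumicks.pylake as lk

    file_h5 = lk.File("example_force_ramp.h5")  # illustrative path, not part of the patch
    # pylake's File object renders the HDF5 group/channel layout as text,
    # which is the string the GUI pushes into its Text widget
    print(file_h5)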
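
The step-result bookkeeping in POTATO_ForceRamp.py switches from DataFrame.append to pd.concat; append was deprecated in pandas 1.4 and removed in pandas 2.0, so the old calls fail on current pandas. The pattern used in the patch, shown here with illustrative column values:

    import pandas as pd

    total_results_steps = pd.DataFrame()
    # one result row per curve; pd.concat replaces the removed DataFrame.append
    row = pd.DataFrame({'filename': 'curve_1', 'orientation': 'forward', 'step #': 0}, index=[0])
    total_results_steps = pd.concat([total_results_steps, row], ignore_index=True, sort=False)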
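
POTATO_find_steps.py also gains a commented-out draft that vectorises cut_off with boolean masks instead of per-row loops. A self-contained sketch of that idea, assuming input_array is a 2-D array, mm is a per-row moving median of the same length, and std/z_score are scalars (names mirror the draft; the example data is illustrative):

    import numpy as np

    def cut_off_vectorised(input_array, column_number, mm, std, z_score):
        # split rows into above / inside / below the z-score band around the moving median
        mm = np.asarray(mm)
        upper = mm + z_score * std
        lower = mm - z_score * std
        col = input_array[:, column_number]
        above_mask = col > upper
        below_mask = col < lower
        inside_mask = ~(above_mask | below_mask)
        return (input_array[above_mask], input_array[inside_mask],
                input_array[below_mask], np.where(inside_mask)[0])

    # illustrative usage on a 2-column derivative array
    arr = np.column_stack((np.arange(5.0), np.array([0.1, 0.2, 2.5, 0.15, -2.0])))
    above, inside, below, inside_idx = cut_off_vectorised(arr, 1, np.full(5, 0.15), 0.3, 3)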
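
Note on the augmentation step added to preprocess_RAW: when the "Data Augmentation" box is ticked, (factor - 1) synthetic points are interpolated between every pair of consecutive raw samples, with Gaussian jitter whose width is tied to the local force/distance increment. A minimal standalone sketch of that interpolation scheme follows; the function name augment_fd and the toy arrays are illustrative and not part of the patch, and the deltas use abs() exactly as in the patch, so interpolated points always step upward from the left-hand sample even where the raw signal decreases.

    import numpy as np

    def augment_fd(force, distance, factor):
        # Interpolate (factor - 1) synthetic samples between consecutive points,
        # mirroring the loop added to preprocess_RAW.
        new_f, new_d = [], []
        for i in range(len(force) - 1):
            f, d = force[i], distance[i]
            new_f.append(f)
            new_d.append(d)
            delta_f = abs(force[i + 1] - f)
            delta_d = abs(distance[i + 1] - d)
            for point in range(1, factor):
                # sigma = 1.5 * delta / 3; because the noise is multiplied by delta
                # again below, the effective jitter scales with delta**2 / factor
                nf = np.random.normal(0, (1.5 * delta_f) / 3)
                nd = np.random.normal(0, (1.5 * delta_d) / 3)
                new_f.append(f + ((point + nf) / factor) * delta_f)
                new_d.append(d + ((point + nd) / factor) * delta_d)
        # note: as in the patch, the last raw sample is not re-appended
        return np.array(new_f), np.array(new_d)

    # toy usage: doubles the point density of a 4-sample curve
    force = np.array([1.0, 1.2, 1.5, 1.4])
    distance = np.array([0.80, 0.81, 0.83, 0.82])
    aug_f, aug_d = augment_fd(force, distance, factor=2)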