-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathLSH.R
99 lines (82 loc) · 2.72 KB
/
LSH.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
#Create MonetDB Connection
library(MonetDB.R)
library(DBI)
library(digest)
library(dplyr)
library(data.table)
# LSH Packages
library(LSHR)
library(Matrix)
library(doParallel)
library(mltools)
######## Fetch Data from DB ########
conn <- dbConnect(MonetDB.R(), host="localhost", dbname="demo", user="monetdb", password="monetdb")
# Run the query inside tryCatch so the connection is always closed, even when
# the query itself fails (previously a failing query left the connection open).
diabetes_binary <- tryCatch(
  dbGetQuery(conn,"SELECT * FROM (SELECT *,
ROW_NUMBER() OVER (ORDER BY encounterid ASC) AS rownumber
FROM MIMICIII.DIABETES_BINARY) AS chunks"),
  finally = dbDisconnect(conn)
)
# Transpose Data
# The first column of the query result supplies the column names of the
# transposed table; every remaining column becomes one row.
# NOTE(review): the query appends a `rownumber` column and `[,-1]` only drops
# the first column, so `rownumber` is transposed into a row as well - confirm
# that this is intended before it is used as a feature.
diabetes.transpose <- setNames(data.frame(t(diabetes_binary[,-1])), diabetes_binary[,1])
######## Minhashing - Create a Signature Matrix ########
#' Build a MinHash signature matrix for a binary table.
#'
#' Rows of `diabetes.transpose` are the elements that get hashed (by row
#' index); columns are the sets being summarised. For every hash function and
#' every column, the signature is the smallest hash value among the rows whose
#' cell equals 1.
#'
#' @param diabetes.transpose data.frame or matrix. A cell equal to 1 marks the
#'   row as present in that column; any other value, including `NA`, is
#'   treated as absent.
#' @param list_hashfct Optional list of functions. Each must accept the vector
#'   of row indices `1..nrow` and return one numeric hash value per index.
#'   When `NULL` (the default), `n_hash` random functions of the form
#'   `(a * x + b) %% p` are generated, with a, b and p drawn from 1..1000.
#' @param n_hash Number of hash functions to generate when `list_hashfct` is
#'   `NULL`.
#' @return data.frame with one row per hash function and one column per column
#'   of `diabetes.transpose`. A column that contains no 1 is all `NA`.
minHash <- function(diabetes.transpose, list_hashfct = NULL, n_hash = 30L) {
  # HashFunctions #
  # a, b and p are drawn in the same order as before, so a seeded run selects
  # the same parameters.
  # NOTE(review): p is drawn from 1..1000 and is not guaranteed to be prime or
  # larger than the number of rows, so hash collisions are likely.
  hashFctCreator <- function() {
    a <- sample(1:1000, 1)
    b <- sample(1:1000, 1)
    p <- sample(1:1000, 1)
    # Parentheses are required: %% binds tighter than * and + in R, so the
    # unparenthesised form evaluates as a * x + (b %% p).
    function(x) (a * x + b) %% p
  }
  if (is.null(list_hashfct)) {
    list_hashfct <- lapply(seq_len(n_hash), function(i) hashFctCreator())
  }
  stopifnot(
    is.list(list_hashfct),
    all(vapply(list_hashfct, is.function, logical(1)))
  )
  print("FUNCTIONS DONE")
  start_time <- Sys.time()

  # Logical membership matrix; NA cells count as "not present".
  is_set <- as.matrix(diabetes.transpose) == 1
  is_set[is.na(is_set)] <- FALSE
  n_row <- nrow(is_set)
  n_col <- ncol(is_set)

  # Only the cells that are set contribute to a signature, so work on their
  # (row, column) coordinates instead of visiting every cell.
  set_cells <- which(is_set, arr.ind = TRUE, useNames = FALSE)
  set_rows <- set_cells[, 1L]
  set_cols <- set_cells[, 2L]

  sig <- matrix(NA_real_, nrow = length(list_hashfct), ncol = n_col)
  row_ids <- seq_len(n_row)
  if (length(set_rows) > 0L) {
    col_groups <- factor(set_cols)
    active_cols <- as.integer(levels(col_groups))
    for (i in seq_along(list_hashfct)) {
      hash_by_row <- list_hashfct[[i]](row_ids)
      stopifnot(length(hash_by_row) == n_row)
      # Minimum hash value per column over the rows present in that column.
      sig[i, active_cols] <- as.vector(
        tapply(hash_by_row[set_rows], col_groups, min)
      )
    }
  }

  end_time <- Sys.time()
  time <- end_time - start_time
  print(time)
  # data.frame() on an unnamed matrix yields columns X1, X2, ... as before.
  data.frame(sig)
}
########## MAIN #########
# Compute the signature matrix for the transposed table. Only the data
# argument is passed; no list_hashfct is supplied by this call.
signifMatrix <- minHash(diabetes.transpose)
########## LSH #######
#' Find candidate pairs by banding a MinHash signature matrix.
#'
#' The first `bands * rows` rows of `sigMatrix` are split into `bands`
#' consecutive groups of `rows` rows. Within each band, columns whose values
#' are identical on every row of the band fall into the same bucket, and every
#' two columns sharing a bucket form a candidate pair.
#'
#' @param sigMatrix Signature matrix or data.frame: one row per hash function,
#'   one column per item.
#' @param bands Number of bands.
#' @param rows Number of signature rows per band; `bands * rows` must not
#'   exceed `nrow(sigMatrix)`.
#' @return data.frame with integer columns `id1` and `id2` (column indices of
#'   `sigMatrix`, `id1 < id2`), one row per distinct candidate pair, sorted by
#'   `id1` then `id2`. It has zero rows when there is no candidate pair.
lsh <- function(sigMatrix, bands, rows) {
  # Work on the argument, not on a global variable.
  signature <- as.matrix(sigMatrix)
  stopifnot(
    length(bands) == 1L, bands >= 1,
    length(rows) == 1L, rows >= 1,
    bands * rows <= nrow(signature)
  )
  n_col <- ncol(signature)
  no_pairs <- data.frame(id1 = integer(0), id2 = integer(0))
  if (n_col < 2L) {
    return(no_pairs)
  }

  column_ids <- seq_len(n_col)
  pairs_per_band <- lapply(seq_len(bands), function(band) {
    band_rows <- ((band - 1L) * rows + 1L):(band * rows)
    band_sig <- signature[band_rows, , drop = FALSE]
    bucket_key <- apply(band_sig, 2L, paste, collapse = ",")
    # A column with a missing value in this band has no usable signature here;
    # an NA key makes split() leave it out instead of pairing all such columns.
    bucket_key[colSums(is.na(band_sig)) > 0L] <- NA
    buckets <- split(column_ids, bucket_key)
    shared <- unname(buckets[lengths(buckets) > 1L])
    do.call(rbind, lapply(shared, function(members) t(combn(members, 2L))))
  })

  pairs <- do.call(rbind, pairs_per_band)
  if (is.null(pairs)) {
    return(no_pairs)
  }
  # The same pair can collide in several bands; report it once.
  pairs <- unique(pairs)
  pairs <- pairs[order(pairs[, 1L], pairs[, 2L]), , drop = FALSE]
  data.frame(id1 = pairs[, 1L], id2 = pairs[, 2L])
}