
Commit

Merge remote-tracking branch 'origin/main'
LasyaPalla committed Feb 3, 2025
2 parents 62cbe17 + 8422596 commit 0cfe54b
Showing 20 changed files with 12,622 additions and 113 deletions.
243 changes: 243 additions & 0 deletions PAMI/extras/syntheticDataGenerator/GeoReferentialSequentialDatabase.py
@@ -0,0 +1,243 @@
# GeoReferentialSequentialDatabase is a code used to generate a synthetic geo-referential sequential database.
#
# **Importing this algorithm into a python program**
# --------------------------------------------------------
# from PAMI.extras.syntheticDataGenerator import GeoReferentialSequentialDatabase as db
# obj = db.GeoReferentialSequentialDatabase(10, 10, 5, 10)
# obj.create()
# obj.save('db.txt')
# print(obj.getSequence()) to get the sequential database as a pandas dataframe

# **Running the code from the command line**
# --------------------------------------------------------
# python GeoReferentialSequentialDatabase.py
# cat db.txt
#


__copyright__ = """
Copyright (C) 2024 Rage Uday Kiran
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
"""

import math

import numpy as np
import pandas as pd
import sys


class GeoReferentialSequentialDatabase:
"""
:Description Generate a sequential database with the given number of lines, average number of items per line, and total number of items
:Attributes:
numSeq: int
- number of sequences in database
avgItemsetPerSeq:int
- avarage number of itemset in one sequence
avgItemsPeritemset: int
- average number of items per itemset
numItems: int
- total kind of items
maxItem: int(default:numItems)
- maximum number of items per itemset
maxItemset: int(default:avgItemsetPerSeq * 2)
- maximum number of itemset per sequence
seqSep: str
- Separator for each item set
:Methods:
create:
Generate the transactional database
save:
Save the sequential database to a file
getTransactions:
Get the sequential database
"""

def __init__(self, numSeq, avgItemsetPerSeq, avgItemsPerItemset, numItems,x1=0,y1=0,x2=100,y2=100, maxItem=0, maxItemset=0,
seqSep="-1") -> None:
"""
        Initialize the geo-referential sequential database with the given parameters
"""

self.numSeq = numSeq
self.avgItemsetPerSeq = avgItemsetPerSeq
self.avgItemsPerItemset = avgItemsPerItemset
self.numItems = numItems
if maxItem == 0:
self.maxItem = numItems
else:
self.maxItem = maxItem
if maxItemset == 0:
self.maxItemset = avgItemsetPerSeq * 2
else:
self.maxItemset = maxItemset
self.seqSep = seqSep
self.db = []
numPoints = (x2 - x1) * (y2 - y1)
if numItems > numPoints:
            raise ValueError("Number of distinct points in the region is smaller than numItems; enlarge the region or reduce numItems")

self.itemPoint = {}


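        # assign each item id a unique random spatial point inside the bounding box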
for i in (range(1, numItems + 1)):
# self.itemPoint[i] = (np.random.randint(x1, x2), np.random.randint(y1, y2))
point = self.getPoint(x1, y1, x2, y2)
while point in self.itemPoint:
point = self.getPoint(x1, y1, x2, y2)
self.itemPoint[i] = point

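    # Assumed helper (not present in the file as committed): __init__ above calls self.getPoint(),
    # so this sketch mirrors GeoReferentialTemporalDatabase.getPoint further down in this commit
    # and draws a random integer coordinate inside the bounding box [x1, x2) x [y1, y2).
    def getPoint(self, x1, y1, x2, y2):
        return (np.random.randint(x1, x2), np.random.randint(y1, y2))
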
def tuning(self, array, sumRes) -> list:
"""
Tune the array so that the sum of the values is equal to sumRes
:param array: list of values
:type array: list
:param sumRes: the sum of the values in the array to be tuned
:type sumRes: int
:return: list of values with the tuned values and the sum of the values in the array to be tuned and sumRes is equal to sumRes
:rtype: list
"""

while np.sum(array) != sumRes:
# get index of largest value
randIndex = np.random.randint(0, len(array))
# if sum is too large, decrease the largest value
if np.sum(array) > sumRes:
array[randIndex] -= 1
# if sum is too small, increase the smallest value
else:
minIndex = np.argmin(array)
array[randIndex] += 1
return array

def generateArray(self, nums, avg, maxItems) -> list:
"""
Generate a random array of length nums whose values average to avg
:param nums: number of values
        :type nums: int
:param avg: average value
:type avg: float
:param maxItems: maximum value
:type maxItems: int
:return: random array
:rtype: list
"""

# generate n random values
values = np.random.randint(1, maxItems, nums)
sumRes = nums * avg

values = self.tuning(values, sumRes)

# if any value is less than 1, increase it and tune the array again
while np.any(values < 1):
for i in range(nums):
if values[i] < 1:
values[i] += 1
values = self.tuning(values, sumRes)

while np.any(values > maxItems):
for i in range(nums):
if values[i] > maxItems:
values[i] -= 1
values = self.tuning(values, sumRes)

        # if all values are equal, randomly increase one value and re-tune the array
while np.all(values == values[0]):
values[np.random.randint(0, nums)] += 1
values = self.tuning(values, sumRes)

return values

def create(self, item="") -> None:
"""
:param item: list (default:generate random numItems items)
item list to make database
Generate the sequential database
:return: None
"""
if item == "":
item=self.itemPoint

db = set()
sequences = self.generateArray(self.numSeq, self.avgItemsetPerSeq - 1, self.maxItemset)

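        # each sequence is built as a flat list of spatial points, with self.seqSep ("-1")
        # marking the boundary between consecutive itemsets; the trailing separator is popped off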
for numItemset in sequences:
seq = []
values = self.generateArray(numItemset + 1, self.avgItemsPerItemset, self.maxItem)

for value in values:
                # np.random.choice cannot sample a dict directly, so sample item ids and map
                # each id to its spatial point (assuming item maps id -> point, as itemPoint does)
                ids = np.random.choice(list(item.keys()), value, replace=False)
                line = list({item[i] for i in ids})
seq = seq + line
seq = seq + [self.seqSep]
seq.pop()

self.db.append(seq)

def save(self, filename, sep="\t") -> None:
"""
Save the transactional database to a file
:param filename: name of the file
:type filename: str
:return: None
"""

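        # one sequence per line: items are written separated by sep; the seqSep token marks itemset boundaries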
with open(filename, 'w') as f:
for line in self.db:
f.write(sep.join(map(str, line)) + '\n')

def getSequence(self) -> pd.DataFrame:
"""
Get the sequential database
:return: the sequential database
:rtype: pd.DataFrame
"""
df = pd.DataFrame(self.db)
return df


if __name__ == "__main__":
# test the class
db = GeoReferentialSequentialDatabase(10, 5, 5, 10)
db.create()
db.save('db.txt')
    print(db.getSequence())
@@ -7,6 +7,7 @@
import tqdm
import pandas as pd


class GeoReferentialTemporalDatabase:
"""
    This class creates a synthetic geo-referential temporal database.
@@ -57,6 +58,7 @@ def __init__(
self.seperator = sep
self.occurrenceProbabilityOfSameTimestamp = occurrenceProbabilityOfSameTimestamp
self.occurrenceProbabilityToSkipSubsequentTimestamp = occurrenceProbabilityToSkipSubsequentTimestamp
self.current_timestamp=int()
self._startTime = float()
self._endTime = float()
self._memoryUSS = float()
@@ -76,7 +78,7 @@ def __init__(

def getPoint(self, x1, y1, x2, y2):

return (np.random.randint(x1, x2), np.random.randint(y1, y2))
return (np.random.randint(x1, x2),np.random.randint(y1, y2))

def performCoinFlip(self, probability: float) -> bool:
"""
@@ -86,7 +88,7 @@ def performCoinFlip(self, probability: float) -> bool:
:return: True if the coin lands heads, False otherwise.
"""
result = np.random.choice([0, 1], p=[1 - probability, probability])
return result == 1
return result

def tuning(self, array, sumRes) -> np.ndarray:
"""
@@ -106,15 +108,14 @@ def tuning(self, array, sumRes) -> np.ndarray:
"""

while np.sum(array) != sumRes:
# get index of largest value
randIndex = np.random.randint(0, len(array))
# if sum is too large, decrease the largest value
if np.sum(array) > sumRes:
array[randIndex] -= 1
maxIndex = np.argmax(array)
array[maxIndex] -= 1
# if sum is too small, increase the smallest value
else:
minIndex = np.argmin(array)
array[randIndex] += 1
array[minIndex] += 1
return array

def generateArray(self, nums, avg, maxItems) -> np.ndarray:
@@ -139,7 +140,7 @@ def generateArray(self, nums, avg, maxItems) -> np.ndarray:
"""

# generate n random values
values = np.random.randint(1, maxItems, nums)
values = np.random.randint(1, avg*1.5, nums)

sumRes = nums * avg

@@ -172,39 +173,32 @@ def create(self) -> None:
"""
self._startTime = time.time()
db = set()
        lineSize = []  # may be an error; needs checking
sumRes = self.databaseSize * self.avgItemsPerTransaction # Total number of items

values = self.generateArray(self.databaseSize, self.avgItemsPerTransaction, self.numItems)

for i in range(self.databaseSize):
# Determine the timestamp
if self.performCoinFlip(self.occurrenceProbabilityOfSameTimestamp):
timestamp = self.current_timestamp
else:
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp):
if self.performCoinFlip(self.occurrenceProbabilityToSkipSubsequentTimestamp)==1:
self.current_timestamp += 2
else:
self.current_timestamp += 1
timestamp = self.current_timestamp

self.db.append([timestamp]) # Start the transaction with the timestamp

lineSize.append([i, 0]) # Initialize lineSize with 0 for each transaction

# Adjust lineSize to ensure sum of sizes equals sumRes
lineSize = self.tuning(lineSize, sumRes)


# For each transaction, generate items
for i in tqdm.tqdm(range(len(lineSize))):
transaction_index = lineSize[i][0]
num_items = lineSize[i][1]
for i in tqdm.tqdm(range(self.databaseSize)):

if num_items > self.numItems:
raise ValueError(
"Error: Either increase numItems or decrease avgItemsPerTransaction or modify percentage")
items = np.random.choice(range(1, self.numItems + 1), num_items, replace=False)
self.db[transaction_index].extend(items)
items = np.random.choice(range(1, self.numItems + 1), values[i], replace=False)
nline = [self.itemPoint[i] for i in items]
self.db[i].extend(nline)

self._runTime = time.time() - self._startTime
self._endTime = time.time()
process = psutil.Process(os.getpid())
self._memoryUSS = process.memory_full_info().uss
self._memoryRSS = process.memory_info().rss