From 2ce9fa466368a954cb23e6dadd6a186bae3dbda1 Mon Sep 17 00:00:00 2001
From: RAGE UDAY KIRAN <52146396+udayRage@users.noreply.github.com>
Date: Sat, 9 Nov 2024 16:37:03 +0900
Subject: [PATCH] #300 bug resolved
---
PAMI/extras/convert/DF2DB.py | 32 ++
.../dbStats/georeferencedTemporalDatabase.py | 457 ++++++++++++++++++
.../georeferencedTransactionalDatabase.py | 355 ++++++++++++++
3 files changed, 844 insertions(+)
create mode 100644 PAMI/extras/dbStats/georeferencedTemporalDatabase.py
create mode 100644 PAMI/extras/dbStats/georeferencedTransactionalDatabase.py
diff --git a/PAMI/extras/convert/DF2DB.py b/PAMI/extras/convert/DF2DB.py
index e3979ab7..c3abb2b1 100644
--- a/PAMI/extras/convert/DF2DB.py
+++ b/PAMI/extras/convert/DF2DB.py
@@ -142,6 +142,38 @@ def convert2UtilityDatabase(self, oFile: str) -> str:
self._endTime = time.time()
return self.DF2DB.getFileName()
+ def convert2geoReferencedTransactionalDatabase(self, oFile: str, condition: str, thresholdValue: Union[int, float]) -> str:
+ """
+        create a geo-referenced transactional database and return the output file name
+        :param oFile: file name or path to store the database
+        :type oFile: str
+        :param condition: condition to judge the values in the dataframe (e.g. >=)
+        :type condition: str
+        :param thresholdValue: threshold value used together with the condition
+        :type thresholdValue: int or float
+        :return: oFile name
+        :rtype: str
+ """
+ self._startTime = time.time()
+ self.DF2DB.convert2TransactionalDatabase(oFile,condition,thresholdValue)
+ process = psutil.Process(os.getpid())
+ self._memoryUSS = process.memory_full_info().uss
+ self._memoryRSS = process.memory_info().rss
+ self._endTime = time.time()
+ return self.DF2DB.getFileName()
+
+ def convert2geoReferencedTemporalDatabase(self, oFile: str, condition: str, thresholdValue: Union[int, float]) -> str:
+ """
+        create a geo-referenced temporal database and return the output file name
+        :param oFile: file name or path to store the database
+        :type oFile: str
+        :param condition: condition to judge the values in the dataframe (e.g. >=)
+        :type condition: str
+        :param thresholdValue: threshold value used together with the condition
+        :type thresholdValue: int or float
+        :return: oFile name
+        :rtype: str
+ """
+ self._startTime = time.time()
+ self.DF2DB.convert2TemporalDatabase(oFile,condition,thresholdValue)
+ process = psutil.Process(os.getpid())
+ self._memoryUSS = process.memory_full_info().uss
+ self._memoryRSS = process.memory_info().rss
+ self._endTime = time.time()
+ return self.DF2DB.getFileName()
+
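+    # A minimal usage sketch for the two converters above (the dataframe, output
+    # file names and threshold below are hypothetical, and the DF2DB constructor
+    # arguments may differ between PAMI versions):
+    #
+    #     from PAMI.extras.convert import DF2DB as df2db
+    #
+    #     obj = df2db.DF2DB(inputDF)
+    #     obj.convert2geoReferencedTransactionalDatabase('geoTransactional.txt', '>=', 1)
+    #     obj.convert2geoReferencedTemporalDatabase('geoTemporal.txt', '>=', 1)
+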
def getMemoryUSS(self) -> float:
"""
Total amount of USS memory consumed by the mining process will be retrieved from this function
diff --git a/PAMI/extras/dbStats/georeferencedTemporalDatabase.py b/PAMI/extras/dbStats/georeferencedTemporalDatabase.py
new file mode 100644
index 00000000..7689b1db
--- /dev/null
+++ b/PAMI/extras/dbStats/georeferencedTemporalDatabase.py
@@ -0,0 +1,457 @@
+# georeferencedTemporalDatabase is a class used to get the statistics of a geo-referenced temporal database.
+#
+# **Importing this algorithm into a python program**
+# --------------------------------------------------------
+#
+#     from PAMI.extras.dbStats import georeferencedTemporalDatabase as db
+#
+#     obj = db.georeferencedTemporalDatabase(iFile, "\t")
+#
+# obj.save(oFile)
+#
+# obj.run()
+#
+# obj.printStats()
+#
+
+
+
+
+__copyright__ = """
+Copyright (C) 2021 Rage Uday Kiran
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+     along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import sys
+import statistics
+import pandas as pd
+import validators
+import numpy as np
+from urllib.request import urlopen
+from typing import Dict, Union
+import PAMI.extras.graph.plotLineGraphFromDictionary as plt
+
+class georeferencedTemporalDatabase:
+ """
+    :Description: georeferencedTemporalDatabase is a class used to get the statistics of a geo-referenced temporal database.
+
+ :Attributes:
+
+ :param inputFile : file
+ input file path
+
+ :param sep : str
+ separator in file. Default is tab space.
+
+ :Methods:
+
+ run()
+ execute readDatabase function
+ readDatabase()
+ read database from input file
+ getDatabaseSize()
+ get the size of database
+ getMinimumTransactionLength()
+ get the minimum transaction length
+ getAverageTransactionLength()
+ get the average transaction length. It is sum of all transaction length divided by database length.
+ getMaximumTransactionLength()
+ get the maximum transaction length
+ getStandardDeviationTransactionLength()
+ get the standard deviation of transaction length
+ getSortedListOfItemFrequencies()
+ get sorted list of item frequencies
+ getSortedListOfTransactionLength()
+ get sorted list of transaction length
+ save(data, outputFile)
+ store data into outputFile
+        getMinimumInterArrivalPeriod()
+            get the minimum inter-arrival period
+        getAverageInterArrivalPeriod()
+            get the average inter-arrival period
+        getMaximumInterArrivalPeriod()
+            get the maximum inter-arrival period
+        getMinimumPeriodOfItem()
+            get the minimum periodicity of an item
+        getAveragePeriodOfItem()
+            get the average periodicity of an item
+        getMaximumPeriodOfItem()
+            get the maximum periodicity of an item
+        getStandardDeviationPeriod()
+            get the standard deviation of the inter-arrival periods
+        getNumberOfTransactionsPerTimestamp()
+            get the number of transactions per timestamp. The timestamp range is 1 to the maximum timestamp.
+
+ **Importing this algorithm into a python program**
+ --------------------------------------------------------
+ .. code-block:: python
+
+        from PAMI.extras.dbStats import georeferencedTemporalDatabase as db
+
+        obj = db.georeferencedTemporalDatabase(iFile, "\t")
+
+ obj.save(oFile)
+
+ obj.run()
+
+ obj.printStats()
+ """
+
+ def __init__(self, inputFile: Union[str, pd.DataFrame], sep: str = '\t') -> None:
+ """
+ :param inputFile: input file name or path
+ :type inputFile: str
+ :param sep: separator
+ :type sep: str
+ :return: None
+ """
+ self.inputFile = inputFile
+ self.database = {}
+ self.lengthList = []
+ self.timeStampCount = {}
+ self.periodList = []
+ self.sep = sep
+ self.periods = {}
+
+ def run(self) -> None:
+ self.readDatabase()
+
+ def readDatabase(self) -> None:
+ """
+        read the database from the input file, store each transaction and its size,
+        and record the inter-arrival periods between consecutive timestamps as a list
+ """
+ numberOfTransaction = 0
+ if isinstance(self.inputFile, pd.DataFrame):
+ if self.inputFile.empty:
+                print("The input dataframe is empty.")
+            i = self.inputFile.columns.values.tolist()
+            if 'TS' in i and 'Transactions' in i:
+                self.database = self.inputFile.set_index('TS').T.to_dict(orient='records')[0]
+            if 'TS' in i and 'Patterns' in i:
+                self.database = self.inputFile.set_index('TS').T.to_dict(orient='records')[0]
+            self.timeStampCount = self.inputFile.groupby('TS').count().T.to_dict(orient='records')[0]
+
+ if isinstance(self.inputFile, str):
+ if validators.url(self.inputFile):
+ data = urlopen(self.inputFile)
+ for line in data:
+ numberOfTransaction += 1
+                    line = line.decode("utf-8").strip()
+                    temp = [i.rstrip() for i in line.split(self.sep)]
+                    temp = [x for x in temp if x]
+                    self.database[numberOfTransaction] = temp[1:]
+                    self.timeStampCount[int(temp[0])] = self.timeStampCount.get(int(temp[0]), 0) + 1
+ else:
+ try:
+ with open(self.inputFile, 'r', encoding='utf-8') as f:
+ for line in f:
+ numberOfTransaction += 1
+                            line = line.strip()
+                            temp = [i.rstrip() for i in line.split(self.sep)]
+                            temp = [x for x in temp if x]
+                            if len(temp) > 0:
+                                self.database[numberOfTransaction] = temp[1:]
+                                self.timeStampCount[int(temp[0])] = self.timeStampCount.get(int(temp[0]), 0) + 1
+ except IOError:
+ print("File Not Found")
+ quit()
+ self.lengthList = [len(s) for s in self.database.values()]
+ timeStampList = sorted(list(self.database.keys()))
+ preTimeStamp = 0
+ for ts in timeStampList:
+ self.periodList.append(int(ts) - preTimeStamp)
+ preTimeStamp = ts
+
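+        # The loop below records, for every item, the largest gap between two of its
+        # consecutive occurrences; the gap before its first occurrence and the gap
+        # from its last occurrence to the end of the database are also counted.
+        # The final value kept in self.periods is this maximum gap (the item's periodicity).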
+ for x, y in self.database.items():
+ for i in y:
+ if i not in self.periods:
+ self.periods[i] = [x, x]
+ else:
+ self.periods[i][0] = max(self.periods[i][0], x - self.periods[i][1])
+ self.periods[i][1] = x
+ for key in self.periods:
+ self.periods[key][0] = max(self.periods[key][0], abs(len(self.database) - self.periods[key][1]))
+ self.periods = {k: v[0] for k, v in self.periods.items()}
+
+ def getDatabaseSize(self) -> int:
+ """
+ get the size of database
+ :return: dataset size
+ :rtype: int
+ """
+ return len(self.database)
+
+ def getMinimumTransactionLength(self) -> int:
+ """
+ get the minimum transaction length
+ :return: minimum transaction length
+ :rtype: int
+ """
+ return min(self.lengthList)
+
+ def getAverageTransactionLength(self) -> float:
+ """
+ get the average transaction length. It is sum of all transaction length divided by database length.
+ :return: average transaction length
+ :rtype: float
+ """
+ totalLength = sum(self.lengthList)
+ return totalLength / len(self.database)
+
+ def getMaximumTransactionLength(self) -> int:
+ """
+ get the maximum transaction length
+ :return: maximum transaction length
+ :rtype: int
+ """
+ return max(self.lengthList)
+
+ def getStandardDeviationTransactionLength(self) -> float:
+ """
+ get the standard deviation transaction length
+ :return: standard deviation transaction length
+ :rtype: float
+ """
+ return statistics.pstdev(self.lengthList)
+
+ def getVarianceTransactionLength(self) -> float:
+ """
+ get the variance transaction length
+ :return: variance transaction length
+ :rtype: float
+ """
+ return statistics.variance(self.lengthList)
+
+ def convertDataIntoMatrix(self) -> np.ndarray:
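+        """
+        Convert the database into a binary item x transaction matrix with one row per item
+        (in descending frequency order) and one column per transaction; an entry is 1 when the item occurs.
+        :return: binary representation of the database
+        :rtype: np.ndarray
+        """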
+ singleItems = self.getSortedListOfItemFrequencies()
+ itemsets = {}
+ for tid in self.database:
+ for item in singleItems:
+ if item in itemsets:
+ if item in self.database[tid]:
+ itemsets[item].append(1)
+ else:
+ itemsets[item].append(0)
+ else:
+ if item in self.database[tid]:
+ itemsets[item] = [1]
+ else:
+ itemsets[item] = [0]
+ data = list(itemsets.values())
+ an_array = np.array(data)
+ return an_array
+
+ def getSparsity(self) -> float:
+ """
+ get the sparsity of database. sparsity is percentage of 0 of database.
+ :return: database sparsity
+ :rtype: float
+ """
+ big_array = self.convertDataIntoMatrix()
+ n_zeros = np.count_nonzero(big_array == 0)
+ return (n_zeros / big_array.size)
+
+    def getDensity(self) -> float:
+        """
+        get the density of database. Density is the percentage of non-zero entries in the item x transaction matrix (1 - sparsity).
+        :return: database density
+        :rtype: float
+        """
+        big_array = self.convertDataIntoMatrix()
+        n_ones = np.count_nonzero(big_array == 1)
+        return (n_ones / big_array.size)
+
+ def getTotalNumberOfItems(self) -> int:
+ """
+ get the number of items in database.
+ :return: number of items
+ :rtype: int
+ """
+ return len(self.getSortedListOfItemFrequencies())
+
+ def getSortedListOfItemFrequencies(self) -> Dict[str, int]:
+ """
+ get sorted list of item frequencies
+ :return: item frequencies
+ :rtype: dict
+ """
+ itemFrequencies = {}
+ for tid in self.database:
+ for item in self.database[tid]:
+ itemFrequencies[item] = itemFrequencies.get(item, 0)
+ itemFrequencies[item] += 1
+ return {k: v for k, v in sorted(itemFrequencies.items(), key=lambda x: x[1], reverse=True)}
+
+ def getFrequenciesInRange(self) -> Dict[int, int]:
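+        """
+        Bucket the item frequencies into five equal-width ranges up to 5/6 of the maximum frequency.
+        The returned dictionary maps the number of items in each bucket to that bucket's upper frequency bound.
+        :rtype: dict
+        """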
+ fre = self.getSortedListOfItemFrequencies()
+ rangeFrequencies = {}
+ maximum = max([i for i in fre.values()])
+ values = [int(i * maximum / 6) for i in range(1, 6)]
+ # print(maximum)
+ va = len({key: val for key, val in fre.items() if val > 0 and val < values[0]})
+ rangeFrequencies[va] = values[0]
+ for i in range(1, len(values)):
+ va = len({key: val for key, val in fre.items() if val < values[i] and val > values[i - 1]})
+ rangeFrequencies[va] = values[i]
+ return rangeFrequencies
+
+ def getPeriodsInRange(self) -> Dict[int, int]:
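+        """
+        Bucket the item periodicities into five equal-width ranges up to 5/6 of the maximum periodicity.
+        The returned dictionary maps the number of items in each bucket to that bucket's upper periodicity bound.
+        :rtype: dict
+        """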
+ fre = {k: v for k, v in sorted(self.periods.items(), key=lambda x: x[1])}
+ rangePeriods = {}
+ maximum = max([i for i in fre.values()])
+ values = [int(i * maximum / 6) for i in range(1, 6)]
+ # print(maximum)
+ va = len({key: val for key, val in fre.items() if val > 0 and val < values[0]})
+ rangePeriods[va] = values[0]
+ for i in range(1, len(values)):
+ va = len({key: val for key, val in fre.items() if val < values[i] and val > values[i - 1]})
+ rangePeriods[va] = values[i]
+ return rangePeriods
+
+ def getTransanctionalLengthDistribution(self) -> Dict[int, int]:
+ """
+        get the distribution of transaction lengths
+        :return: a dictionary with transaction lengths as keys and the number of transactions of that length as values
+        :rtype: dict
+ """
+ transactionLength = {}
+ for length in self.lengthList:
+ transactionLength[length] = transactionLength.get(length, 0)
+ transactionLength[length] += 1
+ return {k: v for k, v in sorted(transactionLength.items(), key=lambda x: x[0])}
+
+ def save(self, data: dict, outputFile: str) -> None:
+ """
+ store data into outputFile
+ :param data: input data
+ :type data: dict
+ :param outputFile: output file name or path to store
+ :type outputFile: str
+ :return: None
+ """
+ with open(outputFile, 'w') as f:
+ for key, value in data.items():
+ f.write(f'{key}\t{value}\n')
+
+ def getMinimumInterArrivalPeriod(self) -> int:
+ """
+ get the minimum inter arrival period
+ :return: minimum inter arrival period
+ :rtype: int
+ """
+ return min(self.periodList)
+
+ def getAverageInterArrivalPeriod(self) -> float:
+ """
+ get the average inter arrival period. It is sum of all period divided by number of period.
+ :return: average inter arrival period
+ :rtype: float
+ """
+ totalPeriod = sum(self.periodList)
+ return totalPeriod / len(self.periodList)
+
+ def getMaximumInterArrivalPeriod(self) -> int:
+ """
+ get the maximum inter arrival period
+ :return: maximum inter arrival period
+ :rtype: int
+ """
+ return max(self.periodList)
+
+ def getMinimumPeriodOfItem(self) -> int:
+ """
+ get the minimum period of the item
+ :return: minimum period
+ :rtype: int
+ """
+ return min([i for i in self.periods.values()])
+
+ def getAveragePeriodOfItem(self) -> float:
+ """
+ get the average period of the item
+ :return: average period
+ :rtype: float
+ """
+ return sum([i for i in self.periods.values()]) / len(self.periods)
+
+ def getMaximumPeriodOfItem(self) -> int:
+ """
+ get the maximum period of the item
+ :return: maximum period
+ :rtype: int
+ """
+ return max([i for i in self.periods.values()])
+
+ def getStandardDeviationPeriod(self) -> float:
+ """
+ get the standard deviation period
+ :return: standard deviation period
+ :rtype: float
+ """
+ return statistics.pstdev(self.periodList)
+
+ def getNumberOfTransactionsPerTimestamp(self) -> Dict[int, int]:
+ """
+ get number of transactions per time stamp
+ :return: number of transactions per time stamp as dict
+ :rtype: dict
+ """
+ maxTS = max(list(self.timeStampCount.keys()))
+ return {ts: self.timeStampCount.get(ts, 0) for ts in range(1, maxTS + 1)}
+
+ def printStats(self) -> None:
+ print(f'Database size : {self.getDatabaseSize()}')
+ print(f'Number of items : {self.getTotalNumberOfItems()}')
+ print(f'Minimum Transaction Size : {self.getMinimumTransactionLength()}')
+ print(f'Average Transaction Size : {self.getAverageTransactionLength()}')
+ print(f'Maximum Transaction Size : {self.getMaximumTransactionLength()}')
+ print(f'Minimum Inter Arrival Period : {self.getMinimumInterArrivalPeriod()}')
+ print(f'Average Inter Arrival Period : {self.getAverageInterArrivalPeriod()}')
+ print(f'Maximum Inter Arrival Period : {self.getMaximumInterArrivalPeriod()}')
+ print(f'Minimum periodicity : {self.getMinimumPeriodOfItem()}')
+ print(f'Average periodicity : {self.getAveragePeriodOfItem()}')
+        print(f'Maximum periodicity : {self.getMaximumPeriodOfItem()}')
+ print(f'Standard Deviation Transaction Size : {self.getStandardDeviationTransactionLength()}')
+ print(f'Variance : {self.getVarianceTransactionLength()}')
+ print(f'Sparsity : {self.getSparsity()}')
+
+ def plotGraphs(self) -> None:
+ itemFrequencies = self.getFrequenciesInRange()
+ transactionLength = self.getTransanctionalLengthDistribution()
+ plt.plotLineGraphFromDictionary(itemFrequencies, 100, 0, 'Frequency', 'no of items', 'frequency')
+ plt.plotLineGraphFromDictionary(transactionLength, 100, 0, 'transaction length', 'transaction length',
+ 'frequency')
+
+
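+# A command-line usage sketch (the input file name here is hypothetical):
+#
+#     python3 georeferencedTemporalDatabase.py geoTemporalDatabase.txt '\t'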
+if __name__ == '__main__':
+    data = {'TS': [1, 2, 3, 4, 5, 6, 7],
+            'Transactions': [['a', 'd', 'e'], ['b', 'a', 'f', 'g', 'h'], ['b', 'a', 'd', 'f'], ['b', 'a', 'c'],
+                             ['a', 'd', 'g', 'k'], ['b', 'd', 'g', 'c', 'i'], ['b', 'd', 'g', 'e', 'j']]}
+
+    # data = pd.read_csv('temporal_T10I4D100K.csv')
+
+    if len(sys.argv) >= 3:
+        obj = georeferencedTemporalDatabase(sys.argv[1], sys.argv[2])
+    else:
+        print("No input file given; using the sample dataframe above.")
+        obj = georeferencedTemporalDatabase(pd.DataFrame(data))
+    obj.run()
+    if obj.getDatabaseSize() > 0:
+        obj.printStats()
+        obj.plotGraphs()
+    else:
+        print("No data found in the database.")
diff --git a/PAMI/extras/dbStats/georeferencedTransactionalDatabase.py b/PAMI/extras/dbStats/georeferencedTransactionalDatabase.py
new file mode 100644
index 00000000..4d44009f
--- /dev/null
+++ b/PAMI/extras/dbStats/georeferencedTransactionalDatabase.py
@@ -0,0 +1,355 @@
+# georeferencedTransactionalDatabase is a class used to get the statistics of a geo-referenced transactional database.
+#
+# **Importing this algorithm into a python program**
+# --------------------------------------------------------
+#
+#     from PAMI.extras.dbStats import georeferencedTransactionalDatabase as db
+#
+#     obj = db.georeferencedTransactionalDatabase(iFile, "\t")
+#
+# obj.save(oFile)
+#
+# obj.run()
+#
+# obj.printStats()
+#
+
+
+__copyright__ = """
+Copyright (C) 2021 Rage Uday Kiran
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+     along with this program. If not, see <https://www.gnu.org/licenses/>.
+"""
+
+import sys
+import statistics
+import pandas as pd
+import validators
+import numpy as np
+from urllib.request import urlopen
+from typing import List, Dict, Tuple, Set, Union, Any, Generator
+import PAMI.extras.graph.plotLineGraphFromDictionary as plt
+
+
+class georeferencedTransactionalDatabase:
+ """
+    :Description: georeferencedTransactionalDatabase is a class used to get the statistics of a geo-referenced transactional database.
+
+ :Attributes:
+
+ :param inputFile: file :
+ input file path
+ :param sep: str
+ separator in file. Default is tab space.
+
+ :Methods:
+
+ run()
+ execute readDatabase function
+ readDatabase()
+ read database from input file
+ getDatabaseSize()
+ get the size of database
+ getMinimumTransactionLength()
+ get the minimum transaction length
+ getAverageTransactionLength()
+ get the average transaction length. It is sum of all transaction length divided by database length.
+ getMaximumTransactionLength()
+ get the maximum transaction length
+ getStandardDeviationTransactionLength()
+ get the standard deviation of transaction length
+ getSortedListOfItemFrequencies()
+ get sorted list of item frequencies
+ getSortedListOfTransactionLength()
+ get sorted list of transaction length
+ save(data, outputFile)
+ store data into outputFile
+
+ **Importing this algorithm into a python program**
+ --------------------------------------------------------
+ .. code-block:: python
+
+        from PAMI.extras.dbStats import georeferencedTransactionalDatabase as db
+
+        obj = db.georeferencedTransactionalDatabase(iFile, "\t")
+
+ obj.save(oFile)
+
+ obj.run()
+
+ obj.printStats()
+
+ """
+
+ def __init__(self, inputFile: Union[str, pd.DataFrame], sep: str = '\t') -> None:
+ """
+ :param inputFile: input file name or path
+ :type inputFile: str
+ :param sep: separator
+ :type sep: str
+ :return: None
+ """
+ self.inputFile = inputFile
+ self.lengthList = []
+ self.sep = sep
+ self.database = {}
+ self.itemFrequencies = {}
+
+ def run(self) -> None:
+ self.readDatabase()
+
+ def readDatabase(self) -> None:
+ """
+ read database from input file and store into database and size of each transaction.
+ """
+ # self.creatingItemSets()
+ numberOfTransaction = 0
+ if isinstance(self.inputFile, pd.DataFrame):
+ if self.inputFile.empty:
+                print("The input dataframe is empty.")
+ i = self.inputFile.columns.values.tolist()
+ if 'tid' in i and 'Transactions' in i:
+ self.database = self.inputFile.set_index('tid').T.to_dict(orient='records')[0]
+ if 'tid' in i and 'Patterns' in i:
+ self.database = self.inputFile.set_index('tid').T.to_dict(orient='records')[0]
+ if isinstance(self.inputFile, str):
+ if validators.url(self.inputFile):
+ data = urlopen(self.inputFile)
+ for line in data:
+ numberOfTransaction += 1
+                    line = line.decode("utf-8").strip()
+ temp = [i.rstrip() for i in line.split(self.sep)]
+ temp = [x for x in temp if x]
+ self.database[numberOfTransaction] = temp
+ else:
+ try:
+ with open(self.inputFile, 'r', encoding='utf-8') as f:
+ for line in f:
+ numberOfTransaction += 1
+                            line = line.strip()
+ temp = [i.rstrip() for i in line.split(self.sep)]
+ temp = [x for x in temp if x]
+ self.database[numberOfTransaction] = temp
+ except IOError:
+ print("File Not Found")
+ quit()
+ self.lengthList = [len(s) for s in self.database.values()]
+
+ def getDatabaseSize(self) -> int:
+ """
+ get the size of database
+ :return: dataset size
+ :rtype: int
+ """
+ return len(self.database)
+
+ def getTotalNumberOfItems(self) -> int:
+ """
+ get the number of items in database.
+ :return: number of items
+ :rtype: int
+ """
+ return len(self.getSortedListOfItemFrequencies())
+
+ def getMinimumTransactionLength(self) -> int:
+ """
+ get the minimum transaction length
+ :return: minimum transaction length
+ :rtype: int
+ """
+ return min(self.lengthList)
+
+ def getAverageTransactionLength(self) -> float:
+ """
+ get the average transaction length. It is sum of all transaction length divided by database length.
+ :return: average transaction length
+ :rtype: float
+ """
+ totalLength = sum(self.lengthList)
+ return totalLength / len(self.database)
+
+ def getMaximumTransactionLength(self) -> int:
+ """
+ get the maximum transaction length
+ :return: maximum transaction length
+ :rtype: int
+ """
+ return max(self.lengthList)
+
+ def getStandardDeviationTransactionLength(self) -> float:
+ """
+ get the standard deviation transaction length
+ :return: standard deviation transaction length
+ :rtype: float
+ """
+ return statistics.pstdev(self.lengthList)
+
+ def getVarianceTransactionLength(self) -> float:
+ """
+ get the variance transaction length
+ :return: variance transaction length
+ :rtype: float
+ """
+ return statistics.variance(self.lengthList)
+
+ def getNumberOfItems(self) -> int:
+ """
+ get the number of items in database.
+ :return: number of items
+ :rtype: int
+ """
+ return len(self.getSortedListOfItemFrequencies())
+
+ def convertDataIntoMatrix(self) -> np.ndarray:
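+        """
+        Convert the database into a binary item x transaction matrix with one row per item
+        (in descending frequency order) and one column per transaction; an entry is 1 when the item occurs.
+        :return: binary representation of the database
+        :rtype: np.ndarray
+        """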
+ singleItems = self.getSortedListOfItemFrequencies()
+ # big_array = np.zeros((self.getDatabaseSize(), len(self.getSortedListOfItemFrequencies())))
+ itemsets = {}
+ for i in self.database:
+ for item in singleItems:
+ if item in itemsets:
+ if item in self.database[i]:
+ itemsets[item].append(1)
+ else:
+ itemsets[item].append(0)
+ else:
+ if item in self.database[i]:
+ itemsets[item] = [1]
+ else:
+ itemsets[item] = [0]
+ # new = pd.DataFrame.from_dict(itemsets)
+ data = list(itemsets.values())
+ an_array = np.array(data)
+ return an_array
+
+ def getSparsity(self) -> float:
+ """
+ get the sparsity of database. sparsity is percentage of 0 of database.
+ :return: database sparsity
+ :rtype: float
+ """
+ big_array = self.convertDataIntoMatrix()
+ n_zeros = np.count_nonzero(big_array == 0)
+ return (n_zeros / big_array.size)
+
+ def getDensity(self) -> float:
+ """
+        get the density of database. Density is the percentage of non-zero entries in the item x transaction matrix.
+        :return: database density
+        :rtype: float
+ """
+ big_array = self.convertDataIntoMatrix()
+ n_zeros = np.count_nonzero(big_array != 0)
+ return (n_zeros / big_array.size)
+
+ def getSortedListOfItemFrequencies(self) -> dict:
+ """
+ get sorted list of item frequencies
+ :return: item frequencies
+ :rtype: dict
+ """
+ itemFrequencies = {}
+ for tid in self.database:
+ for item in self.database[tid]:
+ itemFrequencies[item] = itemFrequencies.get(item, 0)
+ itemFrequencies[item] += 1
+ self.itemFrequencies = {k: v for k, v in sorted(itemFrequencies.items(), key=lambda x: x[1], reverse=True)}
+ return self.itemFrequencies
+
+ def getFrequenciesInRange(self) -> dict:
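+        """
+        Bucket the item frequencies into five equal-width ranges up to 5/6 of the maximum frequency.
+        The returned dictionary maps the number of items in each bucket to that bucket's upper frequency bound.
+        :rtype: dict
+        """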
+ fre = self.getSortedListOfItemFrequencies()
+ rangeFrequencies = {}
+ maximum = max([i for i in fre.values()])
+ values = [int(i * maximum / 6) for i in range(1, 6)]
+ va = len({key: val for key, val in fre.items() if val > 0 and val < values[0]})
+ rangeFrequencies[va] = values[0]
+ for i in range(1, len(values)):
+ va = len({key: val for key, val in fre.items() if val < values[i] and val > values[i - 1]})
+ rangeFrequencies[va] = values[i]
+ return rangeFrequencies
+
+ def getTransanctionalLengthDistribution(self) -> dict:
+ """
+ Get transaction length
+        :return: a dictionary with transaction lengths as keys and the number of transactions of that length as values
+ :rtype: dict
+ """
+ transactionLength = {}
+ for length in self.lengthList:
+ transactionLength[length] = transactionLength.get(length, 0)
+ transactionLength[length] += 1
+ return {k: v for k, v in sorted(transactionLength.items(), key=lambda x: x[0])}
+
+ def save(self, data: dict, outputFile: str) -> None:
+ """
+ store data into outputFile
+ :param data: input data
+ :type data: dict
+ :param outputFile: output file name or path to store
+ :type outputFile: str
+ :return: None
+ """
+ with open(outputFile, 'w') as f:
+ for key, value in data.items():
+ f.write(f'{key}\t{value}\n')
+
+ def printStats(self) -> None:
+ print(f'Database size (total no of transactions) : {self.getDatabaseSize()}')
+ print(f'Number of items : {self.getNumberOfItems()}')
+ print(f'Minimum Transaction Size : {self.getMinimumTransactionLength()}')
+ print(f'Average Transaction Size : {self.getAverageTransactionLength()}')
+ print(f'Maximum Transaction Size : {self.getMaximumTransactionLength()}')
+ print(f'Standard Deviation Transaction Size : {self.getStandardDeviationTransactionLength()}')
+ print(f'Variance in Transaction Sizes : {self.getVarianceTransactionLength()}')
+ print(f'Sparsity : {self.getSparsity()}')
+
+ def plotGraphs(self) -> None:
+ # itemFrequencies = self.getFrequenciesInRange()
+ transactionLength = self.getTransanctionalLengthDistribution()
+ plt.plotLineGraphFromDictionary(self.itemFrequencies, 100, 0, 'Frequency', 'No of items', 'frequency')
+ plt.plotLineGraphFromDictionary(transactionLength, 100, 0, 'transaction length', 'transaction length',
+ 'frequency')
+
+
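+# A command-line usage sketch (the input file name here is hypothetical):
+#
+#     python3 georeferencedTransactionalDatabase.py geoTransactionalDatabase.txt '\t'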
+if __name__ == '__main__':
+    data = {'tid': [1, 2, 3, 4, 5, 6, 7],
+            'Transactions': [['a', 'd', 'e'], ['b', 'a', 'f', 'g', 'h'], ['b', 'a', 'd', 'f'], ['b', 'a', 'c'],
+                             ['a', 'd', 'g', 'k'], ['b', 'd', 'g', 'c', 'i'], ['b', 'd', 'g', 'e', 'j']]}
+
+    # data = pd.read_csv('transactional_T10I4D100K.csv')
+
+    if len(sys.argv) >= 3:
+        obj = georeferencedTransactionalDatabase(sys.argv[1], sys.argv[2])
+    else:
+        print("No input file given; using the sample dataframe above.")
+        obj = georeferencedTransactionalDatabase(pd.DataFrame(data))
+    obj.run()
+    obj.printStats()
+    obj.plotGraphs()
+
+
+