diff --git a/geoAnalytics.zip b/geoAnalytics.zip
new file mode 100644
index 0000000..829dae3
Binary files /dev/null and b/geoAnalytics.zip differ
diff --git a/geoAnalytics/__pycache__/repository.cpython-312.pyc b/geoAnalytics/__pycache__/repository.cpython-312.pyc
index 0e2c147..be47f05 100644
Binary files a/geoAnalytics/__pycache__/repository.cpython-312.pyc and b/geoAnalytics/__pycache__/repository.cpython-312.pyc differ
diff --git a/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc b/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc
index 7cad266..e135a51 100644
Binary files a/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc and b/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc differ
diff --git a/geoAnalytics/repository.py b/geoAnalytics/repository.py
index a7d67f7..72663d1 100644
--- a/geoAnalytics/repository.py
+++ b/geoAnalytics/repository.py
@@ -328,6 +328,8 @@ def filter(self, filterFile):
 
     def calculate_scores_for_row(self, row):
         # Apply scoring only on relevant columns (from 3rd column onward)
+        # range(2, len(row)) skips the first two columns (x and y)
+        # self.scores[j - 2] selects the matching score object, since the x and y columns carry no scores
         return [self.scores[j - 2].calculate_score(row[j]) for j in range(2, len(row))]
 
 
@@ -344,6 +346,14 @@ def filtering(self, dataframe, filterFile):
 
         dataframe[[f'score_{i}' for i in range(score_columns.shape[1])]] = score_columns
 
         return dataframe
+
+    def total_score(self, dataframe):
+        """
+        Add a 'total_score' column: the sum of all score columns divided by the number of score columns.
+        """
+        num_score_cols = sum(col.startswith('score_') for col in dataframe.columns)
+
+        dataframe['total_score'] = dataframe[[f'score_{i}' for i in range(num_score_cols)]].sum(axis=1) / num_score_cols
-
-
+        return dataframe
diff --git a/geoAnalytics/scoreCalculator.py b/geoAnalytics/scoreCalculator.py
index 3307cbd..b55e0b7 100644
--- a/geoAnalytics/scoreCalculator.py
+++ b/geoAnalytics/scoreCalculator.py
@@ -3,14 +3,18 @@ def __init__(self, min_val, max_val, avg_val):
         self.min_val = min_val
         self.max_val = max_val
         self.avg_val = avg_val
-        # Precompute the maximum possible distance from the average
-        self.max_distance = max(max_val - avg_val, avg_val - min_val)
-    
+
     def calculate_score(self, value):
-        if value < self.min_val or value > self.max_val:
+        if value == self.avg_val:
+            return 0
+        elif value == self.min_val or value == self.max_val:
+            return 0.5
+        elif value > self.max_val or value < self.min_val:
             return 1
+        elif value < self.avg_val:
+            # Linear interpolation between min_val and avg_val
+            return 0.5 * (1 - (value - self.min_val) / (self.avg_val - self.min_val))
         else:
-            # Use the precomputed max distance
-            distance_from_avg = abs(value - self.avg_val)
-            score = distance_from_avg / self.max_distance
-            return score
\ No newline at end of file
+            # Linear interpolation between avg_val and max_val
+            return 0.5 * (1 - (self.max_val - value) / (self.max_val - self.avg_val))
+
diff --git a/geoAnalytics/testCase.ipynb b/geoAnalytics/testCase.ipynb
index fe71946..797a551 100644
--- a/geoAnalytics/testCase.ipynb
+++ b/geoAnalytics/testCase.ipynb
@@ -77,18 +77,10 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 3,
    "id": "139f3edf",
    "metadata": {},
    "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:303: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
-      " df = pd.read_sql(sql, self.conn)\n"
-     ]
-    },
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
@@ -108,6 +100,14 @@
      "\n",
      "[27000 rows x 5 columns]\n"
     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:303: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n",
+      " df = pd.read_sql(sql, self.conn)\n"
+     ]
    }
   ],
   "source": [
@@ -117,7 +117,7 @@
  },
  {
   "cell_type": "code",
-  "execution_count": 7,
+  "execution_count": 4,
   "id": "d51ff67e",
   "metadata": {},
   "outputs": [
@@ -125,7 +125,7 @@
   {
    "name": "stderr",
    "output_type": "stream",
    "text": [
-    "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:331: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
+    "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:333: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n",
    " return [self.scores[j - 2].calculate_score(row[j]) for j in range(2, len(row))]\n"
   ]
  },
@@ -168,9 +168,9 @@
   "
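
Note on the scoring change (an illustrative sketch, not part of the diff): the rewritten calculate_score maps a value to 0 at the configured average, 0.5 at the min/max bounds, 1 outside the bounds, and interpolates linearly on each side of the average; total_score then averages the per-band score_i columns. The sketch below assumes the scorer class defined in scoreCalculator.py is exposed as ScoreCalculator and that filtering() has already produced score_0 ... score_{n-1} columns; both names are assumptions for illustration, not confirmed by the patch.

    # Illustrative sketch only; 'ScoreCalculator' is an assumed name for the scorer in
    # scoreCalculator.py, and the column layout mirrors what filtering() produces.
    import pandas as pd

    class ScoreCalculator:
        def __init__(self, min_val, max_val, avg_val):
            self.min_val = min_val
            self.max_val = max_val
            self.avg_val = avg_val

        def calculate_score(self, value):
            # 0 at the average, 0.5 at the bounds, 1 outside them, linear in between.
            if value == self.avg_val:
                return 0
            elif value == self.min_val or value == self.max_val:
                return 0.5
            elif value > self.max_val or value < self.min_val:
                return 1
            elif value < self.avg_val:
                return 0.5 * (1 - (value - self.min_val) / (self.avg_val - self.min_val))
            else:
                return 0.5 * (1 - (self.max_val - value) / (self.max_val - self.avg_val))

    scorer = ScoreCalculator(min_val=0.0, max_val=10.0, avg_val=4.0)
    print(scorer.calculate_score(4.0))   # 0    (exactly the average)
    print(scorer.calculate_score(2.0))   # 0.25 (halfway between min and avg)
    print(scorer.calculate_score(7.0))   # 0.25 (halfway between avg and max)
    print(scorer.calculate_score(12.0))  # 1    (outside [min, max])

    # total_score as the mean of the per-band score columns
    df = pd.DataFrame({'x': [0], 'y': [0], 'score_0': [0.25], 'score_1': [0.75]})
    num_score_cols = sum(col.startswith('score_') for col in df.columns)
    df['total_score'] = df[[f'score_{i}' for i in range(num_score_cols)]].sum(axis=1) / num_score_cols
    print(df['total_score'].iloc[0])     # 0.5

Averaging over score_0 through score_{n-1} keeps total_score on the same 0-to-1 scale as the per-band scores, so a pixel that sits at every band's average stays at 0 and one outside every band's range reaches 1.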