diff --git a/geoAnalytics.zip b/geoAnalytics.zip new file mode 100644 index 0000000..829dae3 Binary files /dev/null and b/geoAnalytics.zip differ diff --git a/geoAnalytics/__pycache__/repository.cpython-312.pyc b/geoAnalytics/__pycache__/repository.cpython-312.pyc index 0e2c147..be47f05 100644 Binary files a/geoAnalytics/__pycache__/repository.cpython-312.pyc and b/geoAnalytics/__pycache__/repository.cpython-312.pyc differ diff --git a/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc b/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc index 7cad266..e135a51 100644 Binary files a/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc and b/geoAnalytics/__pycache__/scoreCalculator.cpython-312.pyc differ diff --git a/geoAnalytics/repository.py b/geoAnalytics/repository.py index a7d67f7..72663d1 100644 --- a/geoAnalytics/repository.py +++ b/geoAnalytics/repository.py @@ -328,6 +328,8 @@ def filter(self, filterFile): def calculate_scores_for_row(self, row): # Apply scoring only on relevant columns (from 3rd column onward) + # range(2, len(row)) is used to skip the first two columns (x and y) + # [j-2] is used to get the correct score object for the column because the first two columns are skipped aka x, y return [self.scores[j - 2].calculate_score(row[j]) for j in range(2, len(row))] @@ -344,6 +346,14 @@ def filtering(self, dataframe, filterFile): dataframe[[f'score_{i}' for i in range(score_columns.shape[1])]] = score_columns return dataframe + + def total_score(self, dataframe): + """ + Add a 'total_score' column holding the mean of all score_<i> columns and return the dataframe. 
+ """ + # score columns are zero-indexed (score_0 .. score_{n-1}), so the range must start at 0; starting at 1 dropped score_0 while still dividing by n + num_score_cols = sum('score_' in col for col in dataframe.columns) + + dataframe['total_score'] = dataframe[[f'score_{i}' for i in range(num_score_cols)]].sum(axis=1) / num_score_cols - - + return dataframe diff --git a/geoAnalytics/scoreCalculator.py b/geoAnalytics/scoreCalculator.py index 3307cbd..b55e0b7 100644 --- a/geoAnalytics/scoreCalculator.py +++ b/geoAnalytics/scoreCalculator.py @@ -3,14 +3,18 @@ def __init__(self, min_val, max_val, avg_val): self.min_val = min_val self.max_val = max_val self.avg_val = avg_val - # Precompute the maximum possible distance from the average - self.max_distance = max(max_val - avg_val, avg_val - min_val) - + def calculate_score(self, value): - if value < self.min_val or value > self.max_val: + if value == self.avg_val: + return 0 + elif value == self.min_val or value == self.max_val: + return 0.5 + elif value > self.max_val or value < self.min_val: return 1 + elif value < self.avg_val: + # Linear interpolation between min_val and avg_val + return 0.5 * (1 - (value - self.min_val) / (self.avg_val - self.min_val)) else: - # Use the precomputed max distance - distance_from_avg = abs(value - self.avg_val) - score = distance_from_avg / self.max_distance - return score \ No newline at end of file + # Linear interpolation between avg_val and max_val + return 0.5 * (1 - (self.max_val - value) / (self.max_val - self.avg_val)) + diff --git a/geoAnalytics/testCase.ipynb b/geoAnalytics/testCase.ipynb index fe71946..797a551 100644 --- a/geoAnalytics/testCase.ipynb +++ b/geoAnalytics/testCase.ipynb @@ -77,18 +77,10 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "id": "139f3edf", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:303: UserWarning: pandas only supports 
SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", - " df = pd.read_sql(sql, self.conn)\n" - ] - }, { "name": "stdout", "output_type": "stream", @@ -108,6 +100,14 @@ "\n", "[27000 rows x 5 columns]\n" ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:303: UserWarning: pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.\n", + " df = pd.read_sql(sql, self.conn)\n" + ] } ], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "id": "d51ff67e", "metadata": {}, "outputs": [ @@ -125,7 +125,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:331: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`\n", + "/Users/tarunsreepada/Github/geoAnalytics/geoAnalytics/repository.py:333: FutureWarning: Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). 
To access a value by position, use `ser.iloc[pos]`\n", " return [self.scores[j - 2].calculate_score(row[j]) for j in range(2, len(row))]\n" ] }, @@ -168,9 +168,9 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 0.086957\n", - " 0.626866\n", - " 0.386503\n", + " 0.068966\n", + " 0.313433\n", + " 0.193252\n", " \n", " \n", " 1\n", @@ -179,9 +179,9 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 0.086957\n", - " 0.626866\n", - " 0.386503\n", + " 0.068966\n", + " 0.313433\n", + " 0.193252\n", " \n", " \n", " 2\n", @@ -190,9 +190,9 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 0.086957\n", - " 0.626866\n", - " 0.386503\n", + " 0.068966\n", + " 0.313433\n", + " 0.193252\n", " \n", " \n", " 3\n", @@ -201,9 +201,9 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 0.086957\n", - " 0.626866\n", - " 0.386503\n", + " 0.068966\n", + " 0.313433\n", + " 0.193252\n", " \n", " \n", " 4\n", @@ -212,9 +212,9 @@ " 0.0\n", " 0.0\n", " 0.0\n", - " 0.086957\n", - " 0.626866\n", - " 0.386503\n", + " 0.068966\n", + " 0.313433\n", + " 0.193252\n", " \n", " \n", " ...\n", @@ -289,11 +289,11 @@ ], "text/plain": [ " x y b1 b2 b3 score_0 score_1 score_2\n", - "0 0.5 89.5 0.0 0.0 0.0 0.086957 0.626866 0.386503\n", - "1 1.5 89.5 0.0 0.0 0.0 0.086957 0.626866 0.386503\n", - "2 2.5 89.5 0.0 0.0 0.0 0.086957 0.626866 0.386503\n", - "3 3.5 89.5 0.0 0.0 0.0 0.086957 0.626866 0.386503\n", - "4 4.5 89.5 0.0 0.0 0.0 0.086957 0.626866 0.386503\n", + "0 0.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252\n", + "1 1.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252\n", + "2 2.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252\n", + "3 3.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252\n", + "4 4.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252\n", "... ... ... ... ... ... ... ... 
...\n", "26995 295.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000\n", "26996 296.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000\n", @@ -304,7 +304,7 @@ "[27000 rows x 8 columns]" ] }, - "execution_count": 7, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -315,11 +315,47 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "d17f1842", "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " x y b1 b2 b3 score_0 score_1 score_2 \\\n", + "0 0.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252 \n", + "1 1.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252 \n", + "2 2.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252 \n", + "3 3.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252 \n", + "4 4.5 89.5 0.0 0.0 0.0 0.068966 0.313433 0.193252 \n", + "... ... ... ... ... ... ... ... ... \n", + "26995 295.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000 \n", + "26996 296.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000 \n", + "26997 297.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000 \n", + "26998 298.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000 \n", + "26999 299.5 0.5 65534.0 65534.0 65534.0 1.000000 1.000000 1.000000 \n", + "\n", + " total_score \n", + "0 0.168895 \n", + "1 0.168895 \n", + "2 0.168895 \n", + "3 0.168895 \n", + "4 0.168895 \n", + "... ... \n", + "26995 0.666667 \n", + "26996 0.666667 \n", + "26997 0.666667 \n", + "26998 0.666667 \n", + "26999 0.666667 \n", + "\n", + "[27000 rows x 9 columns]\n" + ] + } + ], + "source": [ + "print(obj.total_score(df))" + ] }, { "cell_type": "code", @@ -475,7 +511,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "Python 3", "language": "python", "name": "python3" },