diff --git a/.gitignore b/.gitignore index b6e4761..b7b65a7 100644 --- a/.gitignore +++ b/.gitignore @@ -127,3 +127,6 @@ dmypy.json # Pyre type checker .pyre/ + +# sensitive data +data/ diff --git a/README.md b/README.md index 61077fd..b765902 100644 --- a/README.md +++ b/README.md @@ -1,17 +1,17 @@ # eosc-recommender-metrics A framework for counting the recommender metrics -# Preprocessor v.1.0 +# Preprocessor v.0.2


-# RS metrics v.1.0 +# RS metrics v.0.2


@@ -20,13 +20,88 @@ A framework for counting the recommender metrics # Dependencies 1. Install Conda from here: https://docs.conda.io/projects/conda/en/latest/user-guide/install/linux.html. Tested on conda v 4.10.3. -2. Run from terminal: `conda env create -f rsmetrics_env.yml` +2. Run from terminal: `conda env create -f environment.yml` 3. Run from terminal: `conda activate rsmetrics` 4. Run from terminal: `chmod +x ./preprocessor.py ./rsmetrics.py` # Usage 7. Run from terminal: `./preprocessor.py` in order to prepare the data for the RSmetrics -8. Run from terminal: `./rsmetrics.py` to run RSmetrics +```bash + + _____ + | __ \ + | |__) | __ ___ _ __ _ __ ___ ___ ___ ___ ___ ___ _ __ + | ___/ '__/ _ \ '_ \| '__/ _ \ / __/ _ \/ __/ __|/ _ \| '__| + | | | | | __/ |_) | | | (_) | (_| __/\__ \__ \ (_) | | + |_| |_| \___| .__/|_| \___/ \___\___||___/___/\___/|_| + | | + |_| + +Version: 0.2 +© 2022, National Infrastructures for Research and Technology (GRNET) + +usage: preprocessor [-c [FILEPATH]] [-o [DIRPATH]] [-s [DATETIME]] [-e [DATETIME]] [-h] + [-v] + +Prepare data for the EOSC Marketplace RS metrics calculation + +optional arguments: + -c [FILEPATH], --config [FILEPATH] + override default configuration file (./config.yaml) + -o [DIRPATH], --output [DIRPATH] + override default output dir path (./data) + -s [DATETIME], --starttime [DATETIME] + process data starting from given datetime in ISO format (UTC) + e.g. YYYY-MM-DD + -e [DATETIME], --endtime [DATETIME] + process data ending to given datetime in ISO format (UTC) e.g. + YYYY-MM-DD + -h, --help show this help message and exit + -v, --version show program's version number and exit +``` + +8. Configure `./preprocessor.py` by editing the `config.yaml` or providing another with `-c`: +

+ [image lost in extraction: preprocessor configuration screenshot, docs/preprocessor-config.png]
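For orientation, the sketch below is illustrative only: the key names are taken from the `config.yaml` diff later in this changeset, while the surrounding logic merely stands in for the real preprocessing steps and is not part of the PR.

```python
# Illustrative sketch: key names come from the config.yaml in this PR,
# the print statements stand in for the real preprocessing steps.
import yaml

with open('config.yaml') as f:
    config = yaml.safe_load(f)

# Service section: optionally download the service catalog and choose the
# source used to export services.csv
if config['Service']['download']:
    print('downloading service catalog to', config['Service']['path'])
if config['Service']['export']:
    # one of: 'source', 'user_actions', 'recommendations', 'page_map'
    print('exporting services.csv from', config['Service']['from'])
    if config['Service']['from'] == 'source':
        # 'published' applies only to the 'source' option
        print('published services only:', config['Service']['published'])

# User section: choose the source used to export users.csv
if config['User']['export']:
    # one of: 'source', 'user_actions', 'recommendations'
    print('exporting users.csv from', config['User']['from'])

# Metrics toggle: whether the preprocessor also emits its own pre-metrics
print('calculate pre-metrics:', config['Metrics'])
```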

+ + +9. Run from terminal: `./rsmetrics.py` to run RSmetrics +```bash + _____ _____ _ _ + | __ \ / ____| | | (_) + | |__) | (___ _ __ ___ ___| |_ _ __ _ ___ ___ + | _ / \___ \| '_ ` _ \ / _ \ __| '__| |/ __/ __| + | | \ \ ____) | | | | | | __/ |_| | | | (__\__ \ + |_| \_\_____/|_| |_| |_|\___|\__|_| |_|\___|___/ + +Version: 0.2 +© 2022, National Infrastructures for Research and Technology (GRNET) + +usage: rsmetrics [-i [FILEPATH]] [-s [DATETIME]] [-e [DATETIME]] [--users] [--services] + [-h] [-v] + +Calculate metrics for the EOSC Marketplace RS + +optional arguments: + -i [FILEPATH], --input [FILEPATH] + override default output dir (./data) + -s [DATETIME], --starttime [DATETIME] + calculate metrics starting from given datetime in ISO format + (UTC) e.g. YYYY-MM-DD + -e [DATETIME], --endtime [DATETIME] + calculate metrics ending to given datetime in ISO format (UTC) + e.g. YYYY-MM-DD + --users enable reading total users from users.csv, otherwise it will be + calculated according to the user actions + --services enable reading total services from services.csv, otherwise it + will be calculated according to the user actions + -h, --help show this help message and exit + -v, --version show program's version number and exit + +``` ## Reporting diff --git a/config.yaml b/config.yaml index 86fb814..6513966 100644 --- a/config.yaml +++ b/config.yaml @@ -5,16 +5,30 @@ Source: port: 27017 db: recommender_dev -# Use the EOSC-Marketplace webpage -# to associate page_id and service_id -Marketplace: +User: + export: true + #from: 'user_actions' + #from: 'recommendations' + from: 'source' + +Service: + # Use the EOSC-Marketplace webpage + # to associate page_id and service_id download: true path: ./page_map -#Reward: -# transition: ./transition_rewards.csv + export: true + #from: 'user_actions' + #from: 'recommendations' + from: 'source' + #from: 'page_map' + + published: false # applies only on source option + +User-actions: + merge: false # not implemented yet -# Calculate connector's metrics +# Calculate source's metrics Metrics: true diff --git a/docs/Preprocessor.png b/docs/Preprocessor.png index f588bc7..3544397 100644 Binary files a/docs/Preprocessor.png and b/docs/Preprocessor.png differ diff --git a/docs/Preprocessor.png.old b/docs/Preprocessor.png.old new file mode 100644 index 0000000..f588bc7 Binary files /dev/null and b/docs/Preprocessor.png.old differ diff --git a/docs/RSmetrics.png b/docs/RSmetrics.png index 006087a..08cb3d6 100644 Binary files a/docs/RSmetrics.png and b/docs/RSmetrics.png differ diff --git a/docs/RSmetrics.png.old b/docs/RSmetrics.png.old new file mode 100644 index 0000000..006087a Binary files /dev/null and b/docs/RSmetrics.png.old differ diff --git a/docs/preprocessor-config.png b/docs/preprocessor-config.png new file mode 100644 index 0000000..84a160d Binary files /dev/null and b/docs/preprocessor-config.png differ diff --git a/environment.yml b/environment.yml index 7ac05df..db48d7e 100644 --- a/environment.yml +++ b/environment.yml @@ -4,7 +4,7 @@ channels: dependencies: - _libgcc_mutex=0.1=main - _openmp_mutex=4.5=1_gnu - - ca-certificates=2022.3.29=h06a4308_0 + - ca-certificates=2022.3.18=h06a4308_0 - certifi=2021.10.8=py39h06a4308_2 - ld_impl_linux-64=2.35.1=h7274673_9 - libffi=3.3=he6710b0_2 @@ -17,16 +17,28 @@ dependencies: - python=3.9.11=h12debd9_2 - readline=8.1.2=h7f8727e_1 - setuptools=58.0.4=py39h06a4308_0 - - sqlite=3.38.2=hc218d9a_0 + - sqlite=3.38.0=hc218d9a_0 - tk=8.6.11=h1ccaba5_0 - - tzdata=2022a=hda174b7_0 + - tzdata=2021e=hda174b7_0 - 
wheel=0.37.1=pyhd3eb1b0_0 - xz=5.2.5=h7b6447c_0 - zlib=1.2.11=h7f8727e_4 - pip: + - beautifulsoup4==4.10.0 - charset-normalizer==2.0.12 - idna==3.3 - - pymongo==4.0.2 + - joblib==1.1.0 + - natsort==8.1.0 + - numpy==1.22.3 + - pandas==1.4.2 + - pymongo==4.1.0 + - python-dateutil==2.8.2 + - pytz==2022.1 - pyyaml==6.0 - requests==2.27.1 + - scikit-surprise==1.1.1 + - scipy==1.8.0 + - six==1.16.0 + - soupsieve==2.3.2 + - surprise==0.1 - urllib3==1.26.9 diff --git a/get_service_catalog.py b/get_service_catalog.py index e175c46..d3423ac 100755 --- a/get_service_catalog.py +++ b/get_service_catalog.py @@ -7,43 +7,82 @@ -# Main logic -def main(args=None): - # call eosc marketplace with ample number of services per page: default = 1000 - url = "https://marketplace.eosc-portal.eu/services?page=1&per_page={}".format(str(args.items)) - - print("Retrieving page: marketplace list of services... \nGrabbing url: {0}".format(url)) +def get_eosc_marketplace_url(num_of_items=1000): + """Constructs the EOSC Marketplace URL to grab the complete service catalog (list of available services) in one request. + + Args: + num_of_items (int, optional): Number of items per page to be used as a url argument when contacting the EOSC Marketplace webpage to grab all services in one take. Defaults to 1000. + + Returns: + string: EOSC marketplace url along with the necessary url parameters to grab the list of all available services + """ + url = "https://marketplace.eosc-portal.eu/services?page=1&per_page={}".format( + str(num_of_items)) + return url + + +# Contacts eosc marketplace page to retrieve the complete list of items in a single take +def get_service_catalog_page_content(url): + """Returns the HTML page content of the EOSC Marketplace service catalog page + + Args: + url (string): url to EOSC Marketplace Service list + + Returns: + bytes: html content of the eosc marketplace service list page + """ page = requests.get(url) + return page.content - print("Page retrieved!\nGenerating results...") - soup = BeautifulSoup(page.content, 'html.parser') +def get_service_catalog_items(content): + """Parses the EOSC Marketplace service list html page and extracts all active services. + Each service is described by a list of three items: [service_id, service_name, service_path] - # Find all h2 that contain the data-e2e attribute equal to service-id - results = soup.findAll("h2", {"data-e2e":"service-id"}) + Args: + content (bytes): HTML content of the EOSC Marketplace page containing the complete list of available services + + Returns: + list of lists: A list of service entries.
Each service entry is a three-item list containing: [service_id, service_name, service_path] + """ rows = [] - # populate rows with each row = [service id, service name, service path] + soup = BeautifulSoup(content, 'html.parser') + results = soup.findAll("h2", {"data-e2e": "service-id"}) for item in results: - a = item.findChildren("a",recursive=False)[0] - row = [int(item.attrs["data-service-id"]),item.text.strip(),a['href']] + a = item.findChildren("a", recursive=False)[0] + row = [int(item.attrs["data-service-id"]), + item.text.strip(), a['href']] rows.append(row) - # sort rows by id + # sort rows by id rows = sorted(rows, key=lambda x: x[0]) - - # output to csv - with open(args.output, "w") as f: + return rows + +def save_service_items_to_csv(items, output): + with open(output, "w") as f: writer = csv.writer(f) - writer.writerows(rows) - + writer.writerows(items) + +# Main logic +def main(args=None): + # call eosc marketplace with ample number of services per page: default = 1000 + url = get_eosc_marketplace_url(args.items) + print( + "Retrieving page: marketplace list of services... \nGrabbing url: {0}".format(url)) + page_content = get_service_catalog_page_content(url) + print("Page retrieved!\nGenerating results...") + results = get_service_catalog_items(page_content) + # output to csv + save_service_items_to_csv(results, args.output) print("File written to {}".format(args.output)) -# Parse arguments and call main +# Parse arguments and call main if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Retrieve service catalog from eosc marketplace") + parser = argparse.ArgumentParser( + description="Retrieve service catalog from eosc marketplace") parser.add_argument( "-n", "--num-of-items", metavar="STRING", help="Number of items per page", required=False, dest="items", default="1000") parser.add_argument( "-o", "--output", metavar="STRING", help="Output csv file", required=False, dest="output", default="./service_catalog.csv") # Parse the arguments - sys.exit(main(parser.parse_args())) \ No newline at end of file + sys.exit(main(parser.parse_args())) diff --git a/metrics.py b/metrics.py new file mode 100644 index 0000000..f901135 --- /dev/null +++ b/metrics.py @@ -0,0 +1,208 @@ +#!/usr/bin/env python3 +import pandas as pd +import numpy as np + + +class Runtime: + def __init__(self): + self.users=None + self.services=None + self.user_actions=None + self.recommendations=None + +# decorator to add the text attribute to function +def doc(r): + def wrapper(f): + f.text = r + return f + return wrapper + + +# Metrics + +@doc('The total number of unique users found in users.csv (if provided), otherwise in user_actions.csv') +def users(object): + """ + Calculate the total number of unique users + found in Pandas DataFrame object users (if provided) + or user_actions otherwise + """ + if isinstance(object.users, pd.DataFrame): + return int(object.users.nunique()['User']) + else: + return int(object.user_actions.nunique()['User']) + + +@doc('The total number of unique services found in services.csv (if provided), otherwise in user_actions.csv') +def services(object): + """ + Calculate the total number of unique services + found in Pandas DataFrame object services (if provided) + or user_actions otherwise + """ + if isinstance(object.services, pd.DataFrame): + return int(object.services.nunique()['Service']) + else: + return int(object.user_actions.nunique()['Service']) + + +@doc('The total number of recommendations found in recommendations.csv') +def 
recommendations(object): + """ + Calculate the total number of recommendations + found in Pandas DataFrame object recommendations + """ + return len(object.recommendations.index) + + +@doc('The total number of user actions found in user_actions.csv') +def user_actions(object): + """ + Calculate the total number of user_actions + found in Pandas DataFrame object user_actions + """ + return len(object.user_actions.index) + + +@doc('The total number of user actions performed by registered users found in user_actions.csv') +def user_actions_registered(object): + """ + Calculate the total number of user_actions performed by registered users + found in Pandas DataFrame object user_actions + """ + return len(object.user_actions[object.user_actions['User'] != -1].index) + + +@doc('The total number of user actions performed by anonymous users found in user_actions.csv') +def user_actions_anonymous(object): + """ + Calculate the total number of user_actions performed by anonymous users + found in Pandas DataFrame object user_actions + """ + return user_actions(object)-user_actions_registered(object) + + +@doc('The percentage (%) of user actions performed by registered users to the total user actions') +def user_actions_registered_perc(object): + """ + Calculate the percentage (%) of user actions performed + by registered users to the total user actions + found in Pandas DataFrame object user_actions (in two decimals) + """ + return round(user_actions_registered(object)*100.0/user_actions(object),2) + + +@doc('The percentage (%) of user actions performed by anonymous users to the total user actions') +def user_actions_anonymous_perc(object): + """ + Calculate the percentage (%) of user actions performed + by anonymous users to the total user actions + found in Pandas DataFrame object user_actions (in two decimals) + """ + return round(100.0-user_actions_registered_perc(object),2) + + +@doc('The total number of user actions that led to an order found in user_actions.csv') +def user_actions_order(object): + """ + Calculate the total number of user_actions that led to an order + found in Pandas DataFrame object user_actions + """ + return len(object.user_actions[object.user_actions['Reward'] == 1.0].index) + + +@doc('The total number of user actions that led to an order by registered users found in user_actions.csv') +def user_actions_order_registered(object): + """ + Calculate the total number of user_actions that led to an order by registered users + found in Pandas DataFrame object user_actions + """ + return len(object.user_actions[(object.user_actions['Reward'] == 1.0) & (object.user_actions['User'] != -1)].index) + + +@doc('The total number of user actions that led to an order by anonymous users found in user_actions.csv') +def user_actions_order_anonymous(object): + """ + Calculate the total number of user_actions that led to an order by anonymous users + found in Pandas DataFrame object user_actions + """ + return user_actions_order(object)-user_actions_order_registered(object) + + +@doc('The percentage (%) of user actions performed by registered users that led to an order to the total user actions that led to an order') +def user_actions_order_registered_perc(object): + """ + Calculate the percentage (%) of user actions performed + by registered users that led to an order to the total user actions that led to an order + found in Pandas DataFrame object user_actions (in two decimals) + """ + return round(user_actions_order_registered(object)*100.0/user_actions_order(object),2) + + +@doc('The percentage (%) of user actions performed by anonymous users that led to an order to the total user actions that led to an order') +def user_actions_order_anonymous_perc(object): + """ + Calculate the percentage (%) of user actions performed + by anonymous users that led to an order to the total user actions that led to an order + found in Pandas DataFrame object user_actions (in two decimals) + """ + return round(100.0-user_actions_order_registered_perc(object),2) + + +@doc('The total number of user actions associated with the recommendation panel found in user_actions.csv') +def user_actions_panel(object): + """ + Calculate the total number of user_actions associated with the recommendation panel + found in Pandas DataFrame object user_actions + """ + return len(object.user_actions[object.user_actions['Action'] == 'recommendation_panel'].index) + + +@doc('The percentage (%) of user actions associated with the recommendation panel to the total user actions') +def user_actions_panel_perc(object): + """ + Calculate the percentage (%) of user actions associated with + the recommendation panel to the total user actions + found in Pandas DataFrame object user_actions (in two decimals) + """ + return round(user_actions_panel(object)*100.0/user_actions(object),2) + + +@doc('The total number of unique services found in recommendations.csv') +def catalog_coverage(object): + """ + Calculate the total number of unique services + found in recommendations.csv + """ + return int(object.recommendations.nunique()['Service']) + + +@doc('The percentage (%) of unique services found in recommendations.csv to the total number of services (provided or found otherwise in user_actions.csv)') +def catalog_coverage_perc(object): + """ + Calculate the percentage (%) of unique services + found in recommendations.csv to the total number + of services (provided or found otherwise in user_actions.csv) + """ + return round(catalog_coverage(object)*100.0/services(object),2) + + +@doc('The total number of unique users found in recommendations.csv') +def user_coverage(object): + """ + Calculate the total number of unique users + found in recommendations.csv + """ + return int(object.recommendations.nunique()['User']) + + +@doc('The percentage (%) of unique users found in recommendations.csv to the total number of users (provided or found otherwise in user_actions.csv)') +def user_coverage_perc(object): + """ + Calculate the percentage (%) of unique users + found in recommendations.csv to the total number + of users (provided or found otherwise in user_actions.csv) + """ + return round(user_coverage(object)*100.0/users(object),2) + + diff --git a/preprocessor.py b/preprocessor.py index d7886cb..3be128e 100755 --- a/preprocessor.py +++ b/preprocessor.py @@ -5,15 +5,18 @@ import pymongo from datetime import datetime, timezone import os +from natsort import natsorted +import natsort as ns import retrieval import reward_mapping as rm +from get_service_catalog import get_eosc_marketplace_url, get_service_catalog_items, get_service_catalog_page_content, save_service_items_to_csv __copyright__ = "© "+str(datetime.utcnow().year)+", National Infrastructures for Research and Technology (GRNET)" __status__ = "Production" -__version__ = "1.0" +__version__ = "0.2" os.environ['COLUMNS'] = "90" @@ -31,13 +34,24 @@ def inner(): |_| """) print('Version: ' + __version__) - print('License: ' + __license__) print( __copyright__+'\n') func() return inner +def remove_service_prefix(text): + """Removes '/service/' prefix from eosc service paths -parser = argparse.ArgumentParser(prog='rsmetrics', description='Prepare data for the EOSC
Marketplace RS metrics calculation', add_help=False) + Args: + text (string): string containing a service path + + Returns: + string: service path without the /service/ prefix + """ + if text.startswith('/service/'): + return text[len('/service/'):] + return text + +parser = argparse.ArgumentParser(prog='preprocessor', description='Prepare data for the EOSC Marketplace RS metrics calculation', add_help=False) parser.print_help=print_help(parser.print_help) parser._action_groups.pop() required = parser.add_argument_group('required arguments') @@ -72,9 +86,10 @@ def __init__(self, source_page_id, target_page_id, order): self.target.page_id=target_page_id self.action.order=order -m=Metrics() +m=Metrics() + reward_mapping = { "order": 1.0, "interest": 0.7, @@ -117,47 +132,25 @@ def __init__(self, source_page_id, target_page_id, order): # automatically associate page ids to service ids -if config['Marketplace']['download']: - map_pages=[] - - for ua in recdb["user_action"].find(query): - - # remove page_id that do not mean a specific service - path = ua['target']['page_id'].split("/") - - try: - if not (path[1]=="services" and (not path[2]=="c")): - continue - except: - continue - - # keep only the service name (remove prefix /services and suffix e.g. summary) - map_pages.append("/".join(path[2:3])) - - write_pages=[] - - # keep unique pages - map_pages=sorted(list(set(map_pages))) - - for page in map_pages: - - service_id=retrieval.retrieve_id(page) - - if not service_id == None and page: - # print("("+page+")",service_id) # temp - write_pages.append('{},{}\n'.format(page, service_id)) - - - with open(os.path.join(args.output,config['Marketplace']['path']), 'w') as outfile: - outfile.writelines(write_pages) +if config['Service']['download']: + service_list_path = os.path.join(args.output,config['Service']['path']) + eosc_url = get_eosc_marketplace_url() + print( + "Retrieving page: marketplace list of services... 
\nGrabbing url: {0}".format(eosc_url)) + eosc_page_content = get_service_catalog_page_content(eosc_url) + print("Page retrieved!\nGenerating results...") + eosc_service_results = get_service_catalog_items(eosc_page_content) + # output to csv + save_service_items_to_csv(eosc_service_results, service_list_path) + print("File written to {}".format(service_list_path)) # read map file and save in dict -with open(os.path.join(args.output,config['Marketplace']['path']), 'r') as f: +with open(os.path.join(args.output,config['Service']['path']), 'r') as f: lines=f.readlines() -keys=list(map(lambda x: x.split(',')[0].strip(), lines)) -values=list(map(lambda x: x.split(',')[1].strip(), lines)) +keys=list(map(lambda x: remove_service_prefix(x.split(',')[2]).strip(), lines)) +values=list(map(lambda x: x.split(',')[0].strip(), lines)) dmap=dict(zip(keys, values)) #=> {'a': 1, 'b': 2} @@ -166,7 +159,6 @@ def __init__(self, source_page_id, target_page_id, order): for ua in recdb["user_action"].find(query): - # print(ua) # set -1 to anonymous users try: user=ua['user'] @@ -175,7 +167,9 @@ def __init__(self, source_page_id, target_page_id, order): # process data that map from page id to service id exist try: - service_id=dmap["/".join(ua['target']['page_id'].split('/')[2:3])] + _pageid="/"+"/".join(ua['target']['page_id'].split('/')[1:3]) + service_id=dmap[_pageid] + except: continue @@ -197,14 +191,14 @@ def __init__(self, source_page_id, target_page_id, order): luas=[] -for user,_ in sorted(uas.items()): - for service,act in sorted(uas[user].items()): +for user,_ in natsorted(uas.items(),alg=ns.ns.SIGNED): + for service,act in natsorted(uas[user].items(),alg=ns.ns.SIGNED): if service: luas.append('{},{},{},{},{}\n'.format(user, service, *act)) -with open(os.path.join(args.output,'dataset.csv'), 'w') as o: +with open(os.path.join(args.output,'user_actions.csv'), 'w') as o: o.writelines(luas) @@ -219,16 +213,56 @@ def __init__(self, source_page_id, target_page_id, order): for service in rec['services']: recs.append('{},{},{},{}\n'.format(user, service, '1', rec['timestamp'])) -recs=sorted(recs) +recs=natsorted(recs,alg=ns.ns.SIGNED) with open(os.path.join(args.output,'recommendations.csv'), 'w') as o: o.writelines(recs) +# export user catalog +if config['User']['export']: + + if config['User']['from']=='user_actions': + us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',luas)))),alg=ns.ns.SIGNED) + + elif config['User']['from']=='recommendations': + us=natsorted(list(set(list(map(lambda x: x.split(',')[0]+'\n',recs)))),alg=ns.ns.SIGNED) + + else: # 'source' + us=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["user"].find({}))))),alg=ns.ns.SIGNED) + + with open(os.path.join(args.output,'users.csv'), 'w') as o: + o.writelines(us) + + +# export service catalog +if config['Service']['export']: + + if config['Service']['from']=='user_actions': + ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',luas)))),alg=ns.ns.SIGNED) + + elif config['Service']['from']=='recommendations': + ss=natsorted(list(set(list(map(lambda x: x.split(',')[1]+'\n',recs)))),alg=ns.ns.SIGNED) + + elif config['Service']['from']=='page_map': + ss=natsorted(list(set(list(map(lambda x: x+'\n',values)))),alg=ns.ns.SIGNED) + + else: # 'source' + if config['Service']['published']: + ss=natsorted(list(set(list(map(lambda x: str(x['_id'])+'\n',recdb["service"].find({"status":"published"}))))),alg=ns.ns.SIGNED) + else: + ss=natsorted(list(set(list(map(lambda x: 
str(x['_id'])+'\n',recdb["service"].find({}))))),alg=ns.ns.SIGNED) + + with open(os.path.join(args.output,'services.csv'), 'w') as o: + o.writelines(ss) + + # calculate pre metrics if config['Metrics']: time_range=recdb["user_action"].distinct("timestamp", query) + m.timestamp=str(datetime.utcnow()) + m.users=recdb["user"].count_documents({}) m.recommendations=recdb["recommendation"].count_documents(query) m.services=recdb["service"].count_documents({}) @@ -248,14 +282,14 @@ def __init__(self, source_page_id, target_page_id, order): m.user_actions_panel=recdb["user_action"].count_documents({**query, **{"source.root.type":"recommendation_panel"}}) m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2) - m.services_suggested=len(recdb["recommendation"].distinct("services", query)) + m.service_catalog=len(recdb["recommendation"].distinct("services", query)) # catalog coverage - m.services_suggested_perc=round(m.services_suggested*100.0/m.services,2) + m.service_catalog_perc=round(m.service_catalog*100.0/m.services,2) # user coverage - m.users_suggested=len(recdb["user_action"].distinct("user", query)) - m.users_suggested_perc=round(m.users_suggested*100.0/m.users,2) + m.user_catalog=len(recdb["user_action"].distinct("user", query)) + m.user_catalog_perc=round(m.user_catalog*100.0/m.users,2) jsonstr = json.dumps(m.__dict__) print(jsonstr) diff --git a/report.html.prototype b/report.html.prototype index 911aeb2..4e29b9b 100644 --- a/report.html.prototype +++ b/report.html.prototype @@ -105,7 +105,7 @@
- [HTML markup lost in extraction] User Coverage: %
+ [HTML markup lost in extraction] User Coverage: %
@@ -113,7 +113,7 @@
- [HTML markup lost in extraction] Catalog Coverage: %
+ [HTML markup lost in extraction] Catalog Coverage: %
@@ -131,7 +131,7 @@ function fill(data){ // what to fill - let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'services_suggested_perc', 'users_suggested_perc']; + let fill_list = ['users', 'recommendations', 'services', 'user_actions','user_actions_order', 'user_actions_registered', 'user_actions_anonymous', 'user_actions_order_registered', 'user_actions_order_anonymous', 'user_actions_registered_perc', 'user_actions_anonymous_perc', 'user_actions_order_registered_perc', 'user_actions_order_anonymous_perc', 'catalog_coverage_perc', 'user_coverage_perc']; for (item of fill_list) { key = 'val_' + item @@ -150,4 +150,4 @@ function fill(data){ - \ No newline at end of file + diff --git a/requirements.txt b/requirements.txt index e5a5057..688721f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,6 +2,8 @@ beautifulsoup4==4.10.0 certifi==2021.10.8 charset-normalizer==2.0.12 idna==3.3 +joblib==1.1.0 +natsort==8.1.0 numpy==1.22.3 pandas==1.4.2 pymongo==4.1.0 @@ -9,6 +11,9 @@ python-dateutil==2.8.2 pytz==2022.1 PyYAML==6.0 requests==2.27.1 +scikit-surprise==1.1.1 +scipy==1.8.0 six==1.16.0 -soupsieve==2.3.1 +soupsieve==2.3.2 +surprise==0.1 urllib3==1.26.9 diff --git a/rsmetrics.py b/rsmetrics.py index c44effb..129bb42 100755 --- a/rsmetrics.py +++ b/rsmetrics.py @@ -5,12 +5,16 @@ from datetime import datetime, timezone import os import pandas as pd +from inspect import getmembers, isfunction + +# local lib +import metrics as m __copyright__ = "© "+str(datetime.utcnow().year)+", National Infrastructures for Research and Technology (GRNET)" __status__ = "Production" -__version__ = "1.0" +__version__ = "0.2" os.environ['COLUMNS'] = "90" @@ -25,7 +29,6 @@ def inner(): |_| \_\_____/|_| |_| |_|\___|\__|_| |_|\___|___/ """) print('Version: ' + __version__) - print('License: ' + __license__) print( __copyright__+'\n') func() return inner @@ -43,8 +46,8 @@ def inner(): optional.add_argument('-s', '--starttime', metavar=('DATETIME'), help='calculate metrics starting from given datetime in ISO format (UTC) e.g. YYYY-MM-DD', nargs='?', default=None) optional.add_argument('-e', '--endtime', metavar=('DATETIME'), help='calculate metrics ending to given datetime in ISO format (UTC) e.g. 
YYYY-MM-DD', nargs='?', default=None) -optional.add_argument('--users', metavar=('INT'), help='provide the total number of users otherwise it will be calculated according to the user actions', type=int, default=None) -optional.add_argument('--services', metavar=('INT'), help='provide the total number of services otherwise it will be calculated according to the user actions', type=int, default=None) +optional.add_argument('--users', help='enable reading total users from users.csv, otherwise it will be calculated according to the user actions', action='store_true', default=False) +optional.add_argument('--services', help='enable reading total services from services.csv, otherwise it will be calculated according to the user actions', action='store_true', default=False) optional.add_argument('-h', '--help', action='help', help='show this help message and exit') optional.add_argument('-v', '--version', action='version', version='%(prog)s v'+__version__) @@ -53,10 +56,7 @@ def inner(): args=parser.parse_args() -class Metrics: - pass - -m=Metrics() +run=m.Runtime() if args.starttime: args.starttime=datetime.fromisoformat(args.starttime) @@ -73,59 +73,85 @@ class Metrics: sys.exit(0) # read data -uas=pd.read_csv(os.path.join(args.input,'dataset.csv'),names=["User", "Service", "Reward", "Action", "Timestamp"]) -recs=pd.read_csv(os.path.join(args.input,'recommendations.csv'),names=["User", "Service", "Rating", "Timestamp"]) +run.user_actions=pd.read_csv(os.path.join(args.input,'user_actions.csv'),names=["User", "Service", "Reward", "Action", "Timestamp"]) +run.recommendations=pd.read_csv(os.path.join(args.input,'recommendations.csv'),names=["User", "Service", "Rating", "Timestamp"]) # convert timestamp column to datetime object -uas['Timestamp']= pd.to_datetime(uas['Timestamp']) -recs['Timestamp']= pd.to_datetime(recs['Timestamp']) +run.user_actions['Timestamp']= pd.to_datetime(run.user_actions['Timestamp']) +run.recommendations['Timestamp']= pd.to_datetime(run.recommendations['Timestamp']) # restrict data to datetime range if args.starttime: - uas=uas[(uas['Timestamp'] > args.starttime) & (uas['Timestamp'] < args.endtime)] - recs=recs[(recs['Timestamp'] > args.starttime) & (recs['Timestamp'] < args.endtime)] + run.user_actions=run.user_actions[(run.user_actions['Timestamp'] > args.starttime) & (run.user_actions['Timestamp'] < args.endtime)] + run.recommendations=run.recommendations[(run.recommendations['Timestamp'] > args.starttime) & (run.recommendations['Timestamp'] < args.endtime)] else: - uas=uas[uas['Timestamp'] < args.endtime] - recs=recs[recs['Timestamp'] < args.endtime] + run.user_actions=run.user_actions[run.user_actions['Timestamp'] < args.endtime] + run.recommendations=run.recommendations[run.recommendations['Timestamp'] < args.endtime] + +# populate users and services +# if no users or services provided use +# respective columns found in user_actions instead +if args.users: + run.users=pd.read_csv(os.path.join(args.input,'users.csv'),names=["User"]) + +if args.services: + run.services=pd.read_csv(os.path.join(args.input,'services.csv'),names=["Service"]) + + +md={'timestamp':str(datetime.utcnow())} + +# get all functions found in metrics module +# apart from 'doc' func +# run and save the result in dictionary +# where key is the name of the function +# and value what it returns +# whereas, for each found functions +# an extra key_doc element in dictionary is set +# to save the text of the function +funcs = list(map(lambda x: x[0], getmembers(m, isfunction))) +funcs = list(filter(lambda 
x: not x=='doc',funcs)) +for func in funcs: + md[func+'_doc']=getattr(m, func).text + md[func]=getattr(m, func)(run) # get uniq values per column of user actions -uniq_uas=uas.nunique() -uniq_recs=recs.nunique() +#uniq_uas=uas.nunique() +#uniq_recs=recs.nunique() -m.users=int(uniq_uas['User']) if not args.users else args.users -m.services=int(uniq_uas['Service']) if not args.services else args.services +#m.users=int(uniq_uas['User']) if not args.users else int(us['User'].nunique()) +#m.services=int(uniq_uas['Service']) if not args.services else int(ss['Service']) -m.recommendations=len(recs.index) -m.user_actions=len(uas.index) +#m.recommendations=len(recs.index) +#m.user_actions=len(uas.index) -m.user_actions_registered=len(uas[uas['User'] != -1].index) -m.user_actions_anonymous=m.user_actions-m.user_actions_registered +#m.user_actions_registered=len(uas[uas['User'] != -1].index) +#m.user_actions_anonymous=m.user_actions-m.user_actions_registered -m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2) -m.user_actions_anonymous_perc=100-m.user_actions_registered_perc +#m.user_actions_registered_perc=round(m.user_actions_registered*100.0/m.user_actions,2) +#m.user_actions_anonymous_perc=100-m.user_actions_registered_perc -m.user_actions_order=len(uas[uas['Reward'] == 1.0].index) +#m.user_actions_order=len(uas[uas['Reward'] == 1.0].index) -m.user_actions_order_registered=len(uas[(uas['Reward'] == 1.0) & (uas['User'] != -1)].index) -m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered -m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2) -m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc +#m.user_actions_order_registered=len(uas[(uas['Reward'] == 1.0) & (uas['User'] != -1)].index) +#m.user_actions_order_anonymous=m.user_actions_order-m.user_actions_order_registered +#m.user_actions_order_registered_perc=round(m.user_actions_order_registered*100.0/m.user_actions_order,2) +#m.user_actions_order_anonymous_perc=100-m.user_actions_order_registered_perc -m.user_actions_panel=len(uas[uas['Action'] == 'recommendation_panel'].index) -m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2) +#m.user_actions_panel=len(uas[uas['Action'] == 'recommendation_panel'].index) +#m.user_actions_panel_perc=round(m.user_actions_panel*100.0/m.user_actions,2) -m.services_suggested=int(uniq_recs['Service']) +#m.services_suggested=int(uniq_recs['Service']) # catalog coverage -m.services_suggested_perc=round(m.services_suggested*100.0/m.services,2) +#m.services_suggested_perc=round(m.services_suggested*100.0/m.services,2) # user coverage -m.users_suggested=int(uniq_recs['User']) -m.users_suggested_perc=round(m.users_suggested*100.0/m.users,2) - +#m.users_suggested=int(uniq_recs['User']) +#m.users_suggested_perc=round(m.users_suggested*100.0/m.users,2) -jsonstr = json.dumps(m.__dict__) +jsonstr = json.dumps(md) +#jsonstr = json.dumps(m.__dict__) print(jsonstr) # Using a JSON string
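As a closing note on the `rsmetrics.py` changes above: metric functions in `metrics.py` carry their report text via the `@doc` decorator and are then discovered at runtime with `inspect.getmembers`. The following self-contained sketch reproduces that pattern with two toy metrics and toy data; the `Runtime` fields mirror the real module, but the metric bodies and sample values here are placeholders, not taken from the PR.

```python
#!/usr/bin/env python3
# Standalone sketch of the @doc / getmembers discovery pattern used by
# metrics.py and rsmetrics.py. The Runtime data and the two metrics here
# are toy placeholders, not the real ones.
import json
import sys
from inspect import getmembers, isfunction

import pandas as pd


def doc(r):
    """Decorator that attaches a human-readable description to a metric."""
    def wrapper(f):
        f.text = r
        return f
    return wrapper


class Runtime:
    def __init__(self):
        self.user_actions = None
        self.recommendations = None


@doc('The total number of user actions')
def user_actions(run):
    return len(run.user_actions.index)


@doc('The total number of recommendations')
def recommendations(run):
    return len(run.recommendations.index)


if __name__ == "__main__":
    run = Runtime()
    run.user_actions = pd.DataFrame({'User': [1, 2, -1], 'Service': [10, 11, 10]})
    run.recommendations = pd.DataFrame({'User': [1, 2], 'Service': [10, 12]})

    # Discover every decorated metric in this module: store its value under
    # its own name and its description under '<name>_doc', as rsmetrics.py does.
    this_module = sys.modules[__name__]
    md = {}
    for name, func in getmembers(this_module, isfunction):
        if not hasattr(func, 'text'):
            continue  # skip 'doc' itself and any imported helper functions
        md[name + '_doc'] = func.text
        md[name] = func(run)

    print(json.dumps(md))
```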