diff --git a/notebooks/communityDetectionStructures.ipynb b/notebooks/communityDetectionStructures.ipynb new file mode 100644 index 00000000..5eaf8ba4 --- /dev/null +++ b/notebooks/communityDetectionStructures.ipynb @@ -0,0 +1,81871 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 158, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import requests\n", + "\n", + "import mercury\n", + "from dotenv import load_dotenv\n", + "\n", + "import networkx as nx\n", + "from netgraph import Graph, InteractiveGraph\n", + "from ipysigma import Sigma\n", + "from pyvis.network import Network\n", + "\n", + "import folium\n", + "\n", + "import pandas as pd\n", + "from pandas import json_normalize\n", + "\n", + "import matplotlib\n", + "import matplotlib.pyplot as plt\n", + "\n", + "import itertools\n", + "from itertools import combinations" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "application/mercury+json": { + "allow_download": true, + "code_uid": "App.0.40.25.2-rande2e6d846", + "continuous_update": true, + "description": "", + "full_screen": true, + "model_id": "mercury-app", + "notify": "{}", + "output": "app", + "schedule": "", + "show_code": false, + "show_prompt": false, + "show_sidebar": true, + "static_notebook": true, + "stop_on_error": false, + "title": "Display JSON", + "widget": "App" + }, + "text/html": [ + "

Mercury Application

# Data pipeline: load the server environment, query the scanR API for
# publications, derive structure nodes and co-publication edges, then build
# and size-limit the networkx graph `G` used by the visualisation cells.
# `# %%` markers delimit the original notebook cells.

# %%
# Load app for json display
mercury.App(title="Display JSON", static_notebook=True)

# Load server environment
# NOTE(review): relies on sys.path[1] resolving to a directory whose parent
# contains server/.env -- confirm for the kernel this notebook runs under.
load_dotenv(os.path.dirname(sys.path[1]) + '/server/.env')

# %%
# Elastic search info
SCANR_API_URL = os.environ.get('SCANR_API_URL')
SCANR_API_TOKEN = os.environ.get('SCANR_API_TOKEN')
header = {'Authorization': SCANR_API_TOKEN}

# %%
# Query json
json_query = {
    "size": 5000,
    "_source": [
        "id",
        "authors",
        "domains",
        "title",
        "year",
        "isOa",
        "type",
        "affiliations",
        "keywords",
        "summary",
        "alternativeSummary"
    ],
    "query": {
        "bool": {
            "filter": [
                {"terms": {"authors.role.keyword": ["author", "directeurthese"]}},
                {"range": {"year": {"gte": "2018", "lte": "2023"}}},
                {"terms": {"affiliations.id.keyword": ["196012231"]}},
            ],
        }
    }
}

# Request answer
if not SCANR_API_URL:
    raise ValueError("SCANR_API_URL is not set; check server/.env")

# A timeout keeps the notebook from hanging on an unresponsive server, and
# raise_for_status() surfaces HTTP errors here instead of letting an error
# payload fail further down with an opaque AttributeError.
REQUEST_TIMEOUT_SECONDS = 60
response = requests.post(
    SCANR_API_URL,
    json=json_query,
    headers=header,
    timeout=REQUEST_TIMEOUT_SECONDS,
)
response.raise_for_status()
json_answer = response.json()

# %%
# Display json
mercury.JSON(json_answer)

# %%
# Get publications data
# Fall back to empty containers so an answer without hits yields zero works
# rather than raising on None.
hits = (json_answer.get("hits") or {}).get("hits") or []
works = [hit.get("_source") for hit in hits]
print(f"Number of publications : {len(works)}")

# %%
# Filter publications
# Works with max_affiliations or more affiliations are dropped; a missing or
# null "affiliations" field counts as zero affiliations.
max_affiliations = 20
works_filter = [
    work for work in works
    if len(work.get("affiliations") or []) < max_affiliations
]
print(f"Number of publications filtered : {len(works_filter)}/{len(works)}")

# %%
# Compute nodes (structures)
# One node per affiliation id that has GPS coordinates; each node records the
# ids of the publications it appears in.
nodes_dict = {}
for work in works_filter:
    work_id = work.get("id")
    for affiliation in work.get("affiliations") or []:
        affiliation_id = affiliation.get("id")
        # "address" may be absent, null or an empty list.
        addresses = affiliation.get("address") or []
        gps = addresses[0].get("gps") if addresses else None
        if not (affiliation_id and gps):
            continue

        node = nodes_dict.get(affiliation_id)
        if node is None:
            label = affiliation.get("label") or {}
            nodes_dict[affiliation_id] = {
                "id": affiliation_id,
                "name": label.get("en") or label.get("default"),
                "publications": [work_id],
                "x": gps.get("lon"),
                "y": gps.get("lat"),
            }
        elif node["publications"][-1] != work_id:
            # Works are processed one at a time, so an affiliation listed
            # twice in the same work can only duplicate the latest entry;
            # skipping it keeps the node weight equal to the number of
            # distinct publications.
            node["publications"].append(work_id)

nodes = list(nodes_dict.values())
print(f"Number of nodes (structures) found : {len(nodes)}")

# %%
# Compute edges (publications)
# Build each node's publication set once instead of once per pair.
publication_sets = {node["id"]: set(node["publications"]) for node in nodes}

edges = []
for source, target in combinations(nodes, 2):
    shared = publication_sets[source["id"]] & publication_sets[target["id"]]
    if shared:
        edges.append({
            "source": source["id"],
            "target": target["id"],
            "weight": len(shared),
        })

# %%
# Create graph
G = nx.Graph()

# Add nodes (weight = number of publications of the structure)
for node in nodes:
    G.add_node(node["id"], label=node["name"], weight=len(node["publications"]))

# Add edges
# NOTE(review): the strict ">" drops edges whose weight equals
# min_weight_edge, whereas the node filter below uses ">=" -- confirm intent.
min_weight_edge = 10
for edge in edges:
    if edge["weight"] > min_weight_edge:
        G.add_edge(edge["source"], edge["target"], weight=edge["weight"])

# %%
# Filter graph
# Raise the minimum node weight one step at a time until at most max_order
# nodes remain.
max_order = 100
min_weight_node = 1

while G.order() > max_order:
    min_weight_node += 1
    G = G.subgraph([
        node for node, weight in G.nodes(data="weight")
        if weight >= min_weight_node
    ])

print(f"Graph filtered : {G.order()} \nMinimum number of works required: {min_weight_node}")
# Visualisation cells: render the filtered graph `G` as an interactive Sigma
# widget, then draw the co-publication edges on a folium map centred on the
# queried structure. `# %%` markers delimit the original notebook cells.

# %%
# Use sigma widget
Sigma(G, node_size=G.degree,
      node_metrics={"community": "louvain"},
      node_color="community",
      node_border_color_from="node",
      # layout=nodes_dict,
      default_edge_type="curve",
      hide_edges_on_move=True,
      start_layout=1)

# %%
structure_id = "196012231"
structure = nodes_dict[structure_id]
# Bound to `structures_map` rather than `map`: rebinding the builtin map()
# breaks any cell that calls map(...) afterwards in the same kernel.
# Nodes store longitude under "x" and latitude under "y"; folium expects
# (lat, lon).
structures_map = folium.Map(location=(structure.get("y"), structure.get("x")))

# %%
# One (source point, target point) pair per edge, each point as [lat, lon].
# NOTE(review): this draws every entry of `edges`, not only the edges kept in
# the filtered graph G -- confirm that is intended.
edges_polylines = [
    (
        [nodes_dict[e.get("source")].get("y"), nodes_dict[e.get("source")].get("x")],
        [nodes_dict[e.get("target")].get("y"), nodes_dict[e.get("target")].get("x")],
    )
    for e in edges
]

# %%
for line in edges_polylines:
    folium.PolyLine([line]).add_to(structures_map)

# %%
structures_map