diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb new file mode 100644 index 000000000..e288d8329 --- /dev/null +++ b/examples/community-detection.ipynb @@ -0,0 +1,1160 @@ +{ + "cells": [
+ { + "cell_type": "markdown", + "id": "5b60d8ba", + "metadata": {}, + "source": [ + "# Community Detection" + ] + },
+ { + "cell_type": "markdown", + "id": "3e2fb927", + "metadata": {}, + "source": [ + "<a target=\"_blank\" href=\"https://colab.research.google.com/github/neo4j/graph-data-science-client/blob/main/examples/community-detection.ipynb\">\n", + "  <img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/>\n", + "</a>" + ] + },
+ { + "cell_type": "markdown", + "id": "f083f11b", + "metadata": {}, + "source": [ + "This Jupyter notebook is hosted [here](https://github.com/neo4j/graph-data-science-client/blob/main/examples/community-detection.ipynb) in the Neo4j Graph Data Science Client GitHub repository.\n", + "\n", + "The notebook shows how to use the `graphdatascience` library for community detection on the Reddit Hyperlink Network dataset, which can be downloaded [here](https://snap.stanford.edu/data/soc-RedditHyperlinks.html). We will use the `soc-redditHyperlinks-body.tsv` file.\n", + "\n", + "We first preprocess the graph using the Weakly Connected Components algorithm, and then run the Louvain algorithm on the largest component to detect its communities.\n", + "\n", + "### Setup\n", + "\n", + "We need to import the following libraries:\n", + "- os\n", + "- graphdatascience\n", + "- pandas" + ] + },
+ { + "cell_type": "code", + "execution_count": 1, + "id": "3953e353", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import pandas as pd\n", + "from graphdatascience import GraphDataScience" + ] + },
+ { + "cell_type": "code", + "execution_count": 2, + "id": "46b33d2d", + "metadata": {}, + "outputs": [], + "source": [ + "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "NEO4J_AUTH = None\n", + "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", + "    NEO4J_AUTH = (\n", + "        os.environ.get(\"NEO4J_USER\"),\n", + "        os.environ.get(\"NEO4J_PASSWORD\"),\n", + "    )\n", + "\n", + "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)" + ] + },
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SOURCE_SUBREDDITTARGET_SUBREDDITPOST_IDTIMESTAMPLINK_SENTIMENTPROPERTIES
0leagueoflegendsteamredditteams1u4nrps2013-12-31 16:39:581345.0,298.0,0.75652173913,0.0173913043478,0.08...
1theredlionsoccer1u4qkd2013-12-31 18:18:37-1101.0,98.0,0.742574257426,0.019801980198,0.049...
2inlandempirebikela1u4qlzs2014-01-01 14:54:35185.0,85.0,0.752941176471,0.0235294117647,0.082...
3nflcfb1u4sjvs2013-12-31 17:37:5511124.0,949.0,0.772241992883,0.0017793594306,0....
4playmygamegamedev1u4w5ss2014-01-01 02:51:131715.0,622.0,0.777622377622,0.00699300699301,0....
\n", + "
" + ], + "text/plain": [ + " SOURCE_SUBREDDIT TARGET_SUBREDDIT POST_ID TIMESTAMP \\\n", + "0 leagueoflegends teamredditteams 1u4nrps 2013-12-31 16:39:58 \n", + "1 theredlion soccer 1u4qkd 2013-12-31 18:18:37 \n", + "2 inlandempire bikela 1u4qlzs 2014-01-01 14:54:35 \n", + "3 nfl cfb 1u4sjvs 2013-12-31 17:37:55 \n", + "4 playmygame gamedev 1u4w5ss 2014-01-01 02:51:13 \n", + "\n", + " LINK_SENTIMENT PROPERTIES \n", + "0 1 345.0,298.0,0.75652173913,0.0173913043478,0.08... \n", + "1 -1 101.0,98.0,0.742574257426,0.019801980198,0.049... \n", + "2 1 85.0,85.0,0.752941176471,0.0235294117647,0.082... \n", + "3 1 1124.0,949.0,0.772241992883,0.0017793594306,0.... \n", + "4 1 715.0,622.0,0.777622377622,0.00699300699301,0.... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv', sep='\\t')\n", + "df = df[df['TIMESTAMP'] < \"2014-03-01 02:51:13\"]\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "be7b1c5d", + "metadata": {}, + "source": [ + "The `LINK_SENTIMENT` column tells if there is a positive (+1) or negative (-1) relationship from the source subreddit to destination subreddit. We filter out the negative sentiment relationships as they won't add to any meaningful communities. We also drop duplicate relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f153da1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SOURCE_SUBREDDITTARGET_SUBREDDIT
0leagueoflegendsteamredditteams
2inlandempirebikela
3nflcfb
4playmygamegamedev
5dogemarketdogecoin
\n", + "
" + ], + "text/plain": [ + " SOURCE_SUBREDDIT TARGET_SUBREDDIT\n", + "0 leagueoflegends teamredditteams\n", + "2 inlandempire bikela\n", + "3 nfl cfb\n", + "4 playmygame gamedev\n", + "5 dogemarket dogecoin" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relationship_df = df[df['LINK_SENTIMENT'] == 1]\n", + "columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']\n", + "relationship_df = relationship_df[columns]\n", + "relationship_df = relationship_df.drop_duplicates()\n", + "relationship_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "59e6a5e9", + "metadata": {}, + "source": [ + "Next, we get a list of all the distinct nodes (source or destination) and load them as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6266953f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUBREDDIT
0leagueoflegends
1theredlion
2inlandempire
3nfl
4playmygame
\n", + "
" + ], + "text/plain": [ + " SUBREDDIT\n", + "0 leagueoflegends\n", + "1 theredlion\n", + "2 inlandempire\n", + "3 nfl\n", + "4 playmygame" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get unique nodes for each column\n", + "source_nodes = pd.Series(df['SOURCE_SUBREDDIT']).unique()\n", + "target_nodes = pd.Series(df['TARGET_SUBREDDIT']).unique()\n", + "\n", + "# get unique nodes for both columns\n", + "all_nodes = pd.Series(pd.concat([df['SOURCE_SUBREDDIT'], df['TARGET_SUBREDDIT']])).unique()\n", + "\n", + "# create new dataframe with distinct nodes\n", + "nodes_df = pd.DataFrame({'SUBREDDIT': all_nodes})\n", + "nodes_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "c30a4378", + "metadata": {}, + "source": [ + "Finally, we load this data (nodes and edges) into a Graph Database and a GDS graph." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0b18e522", + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.run_cypher(\n", + " \"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {name: node_props.SUBREDDIT})\",\n", + " params = {'nodes_list': nodes_df.to_dict('records')})\n", + "\n", + "gds.run_cypher(\n", + " \"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {name: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {name: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO {relationship_type: rel_props.relationship_type}]->(target)\", \n", + " params = {'edges_list': relationship_df.to_dict('records')})" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "7a3509e8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading: 100%|██████████| 100.0/100 [00:10<00:00, 9.27%/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The projection took 11405 ms\n", + "Graph 'reddit' node count: 3801\n", + "Graph 'reddit' node labels: ['Subreddit']\n" + ] + } + ], + "source": [ + "node_projection = [\"Subreddit\"]\n", + "relationship_projection = {\"HYPERLINKED_TO\": {\"orientation\": \"NATURAL\"}}\n", + "\n", + "G, result = gds.graph.project(\"reddit\", node_projection, relationship_projection) #, nodeProperties = ['node_id', 'node_label']\n", + "\n", + "print(f\"The projection took {result['projectMillis']} ms\")\n", + "\n", + "# We can use convenience methods on `G` to check if the projection looks correct\n", + "print(f\"Graph '{G.name()}' node count: {G.node_count()}\")\n", + "print(f\"Graph '{G.name()}' node labels: {G.node_labels()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "61aa6afe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
degreeDistributiongraphNamedatabasememoryUsagesizeInBytesnodeCountrelationshipCountconfigurationdensitycreationTimemodificationTimeschemaschemaWithOrientation
0{'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631...redditneo4j876 KiB89744838016200{'relationshipProjection': {'HYPERLINKED_TO': ...0.0004292023-05-16T14:07:25.933283995+00:002023-05-16T14:07:27.112253586+00:00{'graphProperties': {}, 'relationships': {'HYP...{'graphProperties': {}, 'relationships': {'HYP...
\n", + "
" + ], + "text/plain": [ + " degreeDistribution graphName database \\\n", + "0 {'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631... reddit neo4j \n", + "\n", + " memoryUsage sizeInBytes nodeCount relationshipCount \\\n", + "0 876 KiB 897448 3801 6200 \n", + "\n", + " configuration density \\\n", + "0 {'relationshipProjection': {'HYPERLINKED_TO': ... 0.000429 \n", + "\n", + " creationTime modificationTime \\\n", + "0 2023-05-16T14:07:25.933283995+00:00 2023-05-16T14:07:27.112253586+00:00 \n", + "\n", + " schema \\\n", + "0 {'graphProperties': {}, 'relationships': {'HYP... \n", + "\n", + " schemaWithOrientation \n", + "0 {'graphProperties': {}, 'relationships': {'HYP... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.graph.list()" + ] + }, + { + "cell_type": "markdown", + "id": "9c259471", + "metadata": {}, + "source": [ + "### Weakly Connected Components\n", + "\n", + "A graph dataset need not always be connected. That is, there may not exist a path from every node to \n", + "every other node in the graph dataset (subgraphs in it may not connected to each other at all). Hence, we \n", + "need to find the total number of nodes in each subgraph to see if it is big enough for further graph analysis. \n", + "Smaller subgraphs or lone nodes will not contribute to the community detection task and should be \n", + "eliminated. Weakly Connected Components is often used as one of the early steps of graph preprocessing.\n", + "\n", + "We use the [Weakly Connected Components](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/wcc/) algorithm to find sets of connected nodes and assign each set a component id." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a114af1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'jobId': 'c7d9036d-b9a5-4d91-8d95-70bccfd67c2d', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n" + ] + } + ], + "source": [ + "df = gds.wcc.mutate(G, mutateProperty='componentId')\n", + "print(df.configuration)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "04fd557e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subreddit [componentId]\n", + "dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G.node_properties()" + ] + }, + { + "cell_type": "markdown", + "id": "23a0a487", + "metadata": {}, + "source": [ + "Next, we will see the size of each connected component and depending on that, we can pick the subgraph that needs further analysis.\n", + "\n", + "We use `run_cypher` here instead of the direct gds client call since we want to see the size of the connected components." + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "2274a19a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentIdcommunitySizesubreddit
003172[leagueoflegends, nfl, playmygame, dogemarket,...
127820[orangered, orangeredacademy, pasto_range, per...
2238[thedoctorstravels, sirron, aislynisdead, game...
37686[iracing, simracing, redditracing, team_medioc...
48326[perfumeexchange, indiemakeupandmore, asianbea...
............
31437121[aggies]
31537591[brunei]
31637691[descentintotyranny]
31737711[outofthemetaloop]
31837731[pokemonshowdown]
\n", + "

319 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " componentId communitySize \\\n", + "0 0 3172 \n", + "1 278 20 \n", + "2 23 8 \n", + "3 768 6 \n", + "4 832 6 \n", + ".. ... ... \n", + "314 3712 1 \n", + "315 3759 1 \n", + "316 3769 1 \n", + "317 3771 1 \n", + "318 3773 1 \n", + "\n", + " subreddit \n", + "0 [leagueoflegends, nfl, playmygame, dogemarket,... \n", + "1 [orangered, orangeredacademy, pasto_range, per... \n", + "2 [thedoctorstravels, sirron, aislynisdead, game... \n", + "3 [iracing, simracing, redditracing, team_medioc... \n", + "4 [perfumeexchange, indiemakeupandmore, asianbea... \n", + ".. ... \n", + "314 [aggies] \n", + "315 [brunei] \n", + "316 [descentintotyranny] \n", + "317 [outofthemetaloop] \n", + "318 [pokemonshowdown] \n", + "\n", + "[319 rows x 3 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + " CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n", + " YIELD nodeId, propertyValue\n", + " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS componentId\n", + " WITH componentId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n", + " RETURN componentId, communitySize, subreddit\n", + " ORDER BY communitySize DESC\n", + "\"\"\"\n", + "\n", + "# query = \"\"\"\n", + "# CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n", + "# YIELD name, propertyValue\n", + "# WITH name as name, gds.util.asNode(name).name AS name, propertyValue AS componentId\n", + "# WITH componentId, collect(name) AS subreddits, size(collect(name)) AS communitySize\n", + "# RETURN componentId, communitySize, subreddits\n", + "# ORDER BY communitySize DESC\n", + "# \"\"\"\n", + "\n", + "# query = \"\"\"\n", + "# CALL gds.wcc.stream('reddit')\n", + "# YIELD nodeId, componentId\n", + "# RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n", + "# ORDER BY size(Subreddits) DESC\n", + "# \"\"\"\n", + "wcc = gds.run_cypher(query)\n", + "wcc" + ] + }, + { + "cell_type": "markdown", + "id": "9a2355cb", + "metadata": {}, + "source": [ + "We can see that the component with Id 0 has the max number of subreddits = 3172. So we will work only with that subgraph." 
+ { + "cell_type": "code", + "execution_count": 17, + "id": "d1994b04", + "metadata": {}, + "outputs": [], + "source": [ + "Largest_CC, _ = gds.beta.graph.project.subgraph(\n", + "    'largest_connected_components',\n", + "    G,\n", + "    'n.componentId=0',\n", + "    '*'\n", + ")" + ] + },
+ { + "cell_type": "code", + "execution_count": 18, + "id": "f3e613a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph({'graphName': 'largest_connected_components', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 16, 14, 43, 14, 779680794, tzinfo=<UTC>), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'f3f02536-8052-4bc9-b143-ca649e16e0d0', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Largest_CC" + ] + },
+ { + "cell_type": "markdown", + "id": "17942d04", + "metadata": {}, + "source": [ + "### Community Detection using Louvain\n", + "\n", + "We use the [Louvain](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/louvain/) algorithm to detect communities in our subgraph, assigning each node a `louvainCommunityId` that identifies its community." + ] + },
+ { + "cell_type": "code", + "execution_count": 19, + "id": "def26464", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Louvain: 100%|██████████| 100.0/100 [00:11<00:00,  8.74%/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "mutateMillis                                                           4\n", + "nodePropertiesWritten                                               3172\n", + "modularity                                                      0.587643\n", + "modularities           [0.4494090889646058, 0.5377130147763601, 0.555...\n", + "ranLevels                                                             10\n", + "communityCount                                                       300\n", + "communityDistribution  {'p99': 196, 'min': 1, 'max': 382, 'mean': 10....\n", + "postProcessingMillis                                                  17\n", + "preProcessingMillis                                                    0\n", + "computeMillis                                                      12084\n", + "configuration          {'maxIterations': 10, 'seedProperty': None, 'c...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "louvain_result = gds.louvain.mutate(Largest_CC, mutateProperty='louvainCommunityId')\n", + "louvain_result" + ] + },
+ { + "cell_type": "markdown", + "id": "7563e824", + "metadata": {}, + "source": [ + "We get a final modularity score of 0.5876 for our community detection." + ] + },
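+ { + "cell_type": "markdown", + "id": "8c9d0e1f", + "metadata": {}, + "source": [ + "Louvain is a hierarchical algorithm, and the `modularities` list above shows how modularity improved over the ten levels that were run. If we also wanted the intermediate community assignments per node, a sketch like the following could be used; note that `stream` mode re-runs the algorithm rather than reusing the mutated property." + ] + },
+ { + "cell_type": "code", + "execution_count": null, + "id": "9d0e1f2a", + "metadata": {}, + "outputs": [], + "source": [ + "# Yields one row per node with its final communityId and the list of\n", + "# intermediate community ids from each Louvain level.\n", + "gds.louvain.stream(Largest_CC, includeIntermediateCommunities=True).head()" + ] + },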
+ { + "cell_type": "code", + "execution_count": 20, + "id": "858c65be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "writeMillis                                      578\n", + "graphName               largest_connected_components\n", + "nodeProperties                  [louvainCommunityId]\n", + "propertiesWritten                               3172\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.graph.nodeProperties.write(Largest_CC, [\"louvainCommunityId\"])" + ] + },
+ { + "cell_type": "markdown", + "id": "afb104e0", + "metadata": {}, + "source": [ + "We can also verify that the property was written with the following call:" + ] + },
+ { + "cell_type": "code", + "execution_count": 21, + "id": "22f73aea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subreddit    [componentId, louvainCommunityId]\n", + "dtype: object" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Largest_CC.node_properties()" + ] + },
+ { + "cell_type": "code", + "execution_count": 23, + "id": "19ccfcb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityIdcommunitySizesubreddit
02406382[airsoft, bandnames, connecticut, thehiddenbar...
12516309[posthardcore, metalcore, corejerk, iama, karm...
22654282[locationbot, oldschoolcoolnsfw, uncomfortable...
32676196[playmygame, circlebroke, tribes, conspiratard...
42546185[leagueoflegends, kpop, turntablists, minecraf...
............
29530341[screenshots]
29630391[leangains]
29730401[agnostic]
29830431[mario]
29930451[vegproblems]
\n", + "

300 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " communityId communitySize \\\n", + "0 2406 382 \n", + "1 2516 309 \n", + "2 2654 282 \n", + "3 2676 196 \n", + "4 2546 185 \n", + ".. ... ... \n", + "295 3034 1 \n", + "296 3039 1 \n", + "297 3040 1 \n", + "298 3043 1 \n", + "299 3045 1 \n", + "\n", + " subreddit \n", + "0 [airsoft, bandnames, connecticut, thehiddenbar... \n", + "1 [posthardcore, metalcore, corejerk, iama, karm... \n", + "2 [locationbot, oldschoolcoolnsfw, uncomfortable... \n", + "3 [playmygame, circlebroke, tribes, conspiratard... \n", + "4 [leagueoflegends, kpop, turntablists, minecraf... \n", + ".. ... \n", + "295 [screenshots] \n", + "296 [leangains] \n", + "297 [agnostic] \n", + "298 [mario] \n", + "299 [vegproblems] \n", + "\n", + "[300 rows x 3 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + " CALL gds.graph.nodeProperties.stream('largest_connected_components', 'louvainCommunityId')\n", + " YIELD nodeId, propertyValue\n", + " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS communityId\n", + " WITH communityId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n", + " RETURN communityId, communitySize, subreddit\n", + " ORDER BY communitySize DESC\n", + "\"\"\"\n", + "\n", + "communities = gds.run_cypher(query)\n", + "communities" + ] + }, + { + "cell_type": "markdown", + "id": "65dcb952", + "metadata": {}, + "source": [ + "### References\n", + "S. Kumar, W.L. Hamilton, J. Leskovec, D. Jurafsky. Community Interaction and Conflict on the Web. World Wide Web Conference, 2018." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37b59b5b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}