From 75ebc66cac94b316fee1ab82c856d42392a47a6d Mon Sep 17 00:00:00 2001 From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com> Date: Mon, 15 May 2023 16:02:34 -0400 Subject: [PATCH 1/4] added community detection jupyter notebook --- examples/community-detection.ipynb | 1138 ++++++++++++++++++++++++++++ 1 file changed, 1138 insertions(+) create mode 100644 examples/community-detection.ipynb diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb new file mode 100644 index 000000000..f4068f4ee --- /dev/null +++ b/examples/community-detection.ipynb @@ -0,0 +1,1138 @@ +{ + "cells": [ + { + "attachments": {}, + "cell_type": "markdown", + "id": "f083f11b", + "metadata": {}, + "source": [ + "## Community Detection\n", + "\n", + "The notebook shows the usage of the `graphdatascience` library for community detection on the Reddit Hyperlink Network dataset that can be downloaded [here](https://snap.stanford.edu/data/soc-RedditHyperlinks.html). We will use the `soc-redditHyperlinks-body.tsv` file.\n", + "\n", + "The tasks we cover here include performing initial graph preprocessing using Weakly Connected Components and then performing community detection on the largest component using the Louvain algorithm.\n", + "\n", + "### Setup\n", + "\n", + "We need to import the following libraries:\n", + "- graphdatascience\n", + "- neo4j\n", + "- pandas" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "3953e353", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\kedar\\anaconda3\\envs\\graph_stuff\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. 
See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n" + ] + } + ], + "source": [ + "from graphdatascience import GraphDataScience\n", + "from neo4j import GraphDatabase\n", + "from neo4j.exceptions import ServiceUnavailable\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "46b33d2d", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "2.3.5\n" + ] + } + ], + "source": [ + "# # Replace with the actual connection URI and credentials\n", + "NEO4J_CONNECTION_URI = \"bolt://XXXXXXXXXXXXX\n", + "NEO4J_USERNAME = \"neo4j\"\n", + "NEO4J_PASSWORD = \"XXXXXXXXXXXXX\"\n", + "\n", + "# Client instantiation\n", + "gds = GraphDataScience(\n", + " NEO4J_CONNECTION_URI,\n", + " auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n", + ")\n", + "\n", + "print(gds.version())" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "48bd8af1", + "metadata": {}, + "source": [ + "### Importing the dataset\n", + "\n", + "We import the dataset as a pandas dataframe first. We work with only a subset of the dataset. The sampled data is only till 1st March 2014. " + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "a8e677aa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SOURCE_SUBREDDITTARGET_SUBREDDITPOST_IDTIMESTAMPLINK_SENTIMENTPROPERTIES
0leagueoflegendsteamredditteams1u4nrps2013-12-31 16:39:581345.0,298.0,0.75652173913,0.0173913043478,0.08...
1theredlionsoccer1u4qkd2013-12-31 18:18:37-1101.0,98.0,0.742574257426,0.019801980198,0.049...
2inlandempirebikela1u4qlzs2014-01-01 14:54:35185.0,85.0,0.752941176471,0.0235294117647,0.082...
3nflcfb1u4sjvs2013-12-31 17:37:5511124.0,949.0,0.772241992883,0.0017793594306,0....
4playmygamegamedev1u4w5ss2014-01-01 02:51:131715.0,622.0,0.777622377622,0.00699300699301,0....
\n", + "
" + ], + "text/plain": [ + " SOURCE_SUBREDDIT TARGET_SUBREDDIT POST_ID TIMESTAMP \\\n", + "0 leagueoflegends teamredditteams 1u4nrps 2013-12-31 16:39:58 \n", + "1 theredlion soccer 1u4qkd 2013-12-31 18:18:37 \n", + "2 inlandempire bikela 1u4qlzs 2014-01-01 14:54:35 \n", + "3 nfl cfb 1u4sjvs 2013-12-31 17:37:55 \n", + "4 playmygame gamedev 1u4w5ss 2014-01-01 02:51:13 \n", + "\n", + " LINK_SENTIMENT PROPERTIES \n", + "0 1 345.0,298.0,0.75652173913,0.0173913043478,0.08... \n", + "1 -1 101.0,98.0,0.742574257426,0.019801980198,0.049... \n", + "2 1 85.0,85.0,0.752941176471,0.0235294117647,0.082... \n", + "3 1 1124.0,949.0,0.772241992883,0.0017793594306,0.... \n", + "4 1 715.0,622.0,0.777622377622,0.00699300699301,0.... " + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\\t')\n", + "df = df[df['TIMESTAMP'] < \"2014-03-01 02:51:13\"]\n", + "df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "be7b1c5d", + "metadata": {}, + "source": [ + "The `LINK_SENTIMENT` column tells if there is a positive (+1) or negative (-1) relationship from the source subreddit to destination subreddit. We filter out the negative sentiment relationships as they won't add to any meaningful communities. We also drop duplicate relationships." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f153da1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SOURCE_SUBREDDITTARGET_SUBREDDIT
0leagueoflegendsteamredditteams
2inlandempirebikela
3nflcfb
4playmygamegamedev
5dogemarketdogecoin
\n", + "
" + ], + "text/plain": [ + " SOURCE_SUBREDDIT TARGET_SUBREDDIT\n", + "0 leagueoflegends teamredditteams\n", + "2 inlandempire bikela\n", + "3 nfl cfb\n", + "4 playmygame gamedev\n", + "5 dogemarket dogecoin" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "relationship_df = df[df['LINK_SENTIMENT'] == 1]\n", + "columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']\n", + "relationship_df = relationship_df[columns]\n", + "relationship_df = relationship_df.drop_duplicates()\n", + "relationship_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "59e6a5e9", + "metadata": {}, + "source": [ + "Next, we get a list of all the distinct nodes (source or destination) and load them as a dataframe." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "6266953f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SUBREDDIT
0leagueoflegends
1theredlion
2inlandempire
3nfl
4playmygame
\n", + "
" + ], + "text/plain": [ + " SUBREDDIT\n", + "0 leagueoflegends\n", + "1 theredlion\n", + "2 inlandempire\n", + "3 nfl\n", + "4 playmygame" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# get unique nodes for each column\n", + "source_nodes = pd.Series(df['SOURCE_SUBREDDIT']).unique()\n", + "target_nodes = pd.Series(df['TARGET_SUBREDDIT']).unique()\n", + "\n", + "# get unique nodes for both columns\n", + "all_nodes = pd.Series(pd.concat([df['SOURCE_SUBREDDIT'], df['TARGET_SUBREDDIT']])).unique()\n", + "\n", + "# create new dataframe with distinct nodes\n", + "nodes_df = pd.DataFrame({'SUBREDDIT': all_nodes})\n", + "nodes_df.head()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "c30a4378", + "metadata": {}, + "source": [ + "Finally, we load this data (nodes and edges) into a Graph Database and a GDS graph." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "0b18e522", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "driver = GraphDatabase.driver(NEO4J_CONNECTION_URI,\n", + " auth=(NEO4J_USERNAME, NEO4J_PASSWORD))\n", + "\n", + "# Create nodes and relationships in the graph using UNWIND\n", + "with driver.session() as session:\n", + " # Create nodes using UNWIND\n", + " nodes_list = nodes_df.to_dict('records')\n", + " session.run(\"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {node_id: node_props.SUBREDDIT, node_label: node_props.SUBREDDIT})\", nodes_list=nodes_list)\n", + "\n", + " # Create relationships using UNWIND\n", + " edges_list = relationship_df.to_dict('records')\n", + " session.run(\"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {node_id: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {node_id: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO {relationship_type: rel_props.relationship_type}]->(target)\", edges_list=edges_list)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": 
"7a3509e8", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Loading: 100%|██████████| 100.0/100 [00:09<00:00, 11.02%/s] \n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The projection took 9289 ms\n", + "Graph 'reddit' node count: 3801\n", + "Graph 'reddit' node labels: ['Subreddit']\n" + ] + } + ], + "source": [ + "node_projection = [\"Subreddit\"]\n", + "relationship_projection = {\"HYPERLINKED_TO\": {\"orientation\": \"NATURAL\"}}\n", + "\n", + "G, result = gds.graph.project(\"reddit\", node_projection, relationship_projection) #, nodeProperties = ['node_id', 'node_label']\n", + "\n", + "print(f\"The projection took {result['projectMillis']} ms\")\n", + "\n", + "# We can use convenience methods on `G` to check if the projection looks correct\n", + "print(f\"Graph '{G.name()}' node count: {G.node_count()}\")\n", + "print(f\"Graph '{G.name()}' node labels: {G.node_labels()}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "61aa6afe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
degreeDistributiongraphNamedatabasememoryUsagesizeInBytesnodeCountrelationshipCountconfigurationdensitycreationTimemodificationTimeschemaschemaWithOrientation
0{'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631...redditneo4j876 KiB89744838016200{'relationshipProjection': {'HYPERLINKED_TO': ...0.0004292023-05-14T15:29:30.028518203+00:002023-05-14T15:29:30.931965175+00:00{'graphProperties': {}, 'relationships': {'HYP...{'graphProperties': {}, 'relationships': {'HYP...
\n", + "
" + ], + "text/plain": [ + " degreeDistribution graphName database \\\n", + "0 {'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631... reddit neo4j \n", + "\n", + " memoryUsage sizeInBytes nodeCount relationshipCount \\\n", + "0 876 KiB 897448 3801 6200 \n", + "\n", + " configuration density \\\n", + "0 {'relationshipProjection': {'HYPERLINKED_TO': ... 0.000429 \n", + "\n", + " creationTime modificationTime \\\n", + "0 2023-05-14T15:29:30.028518203+00:00 2023-05-14T15:29:30.931965175+00:00 \n", + "\n", + " schema \\\n", + "0 {'graphProperties': {}, 'relationships': {'HYP... \n", + "\n", + " schemaWithOrientation \n", + "0 {'graphProperties': {}, 'relationships': {'HYP... " + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gds.graph.list()" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9c259471", + "metadata": {}, + "source": [ + "### Weakly Connected Components\n", + "\n", + "A graph dataset need not always be connected. That is, there may not exist a path from every node to \n", + "every other node in the graph dataset (subgraphs in it may not be connected to each other at all). Hence, we \n", + "need to find the total number of nodes in each subgraph to see if it is big enough for further graph analysis. \n", + "Smaller subgraphs or lone nodes will not contribute to the community detection task and should be \n", + "eliminated. Weakly Connected Components is often used as one of the early steps of graph preprocessing.\n", + "\n", + "We use the [Weakly Connected Components](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/wcc/) algorithm to find sets of connected nodes and assign each set a component id." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7a114af1", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "{'jobId': 'b69faaaa-a267-444c-82b2-d11c66f9a6a4', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n" + ] + } + ], + "source": [ + "df = gds.wcc.mutate(G, mutateProperty='componentId')\n", + "print(df.configuration)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "04fd557e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subreddit [componentId]\n", + "dtype: object" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G.node_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2274a19a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
componentIdSubredditsNum_subreddits
00[leagueoflegends, nfl, playmygame, dogemarket,...3172
1278[orangered, orangeredacademy, pasto_range, per...20
223[thedoctorstravels, sirron, aislynisdead, game...8
3768[iracing, simracing, redditracing, team_medioc...6
4832[perfumeexchange, indiemakeupandmore, asianbea...6
............
3143712[aggies]1
3153759[brunei]1
3163769[descentintotyranny]1
3173771[outofthemetaloop]1
3183773[pokemonshowdown]1
\n", + "

319 rows × 3 columns

\n", + "
" + ], + "text/plain": [ + " componentId Subreddits \\\n", + "0 0 [leagueoflegends, nfl, playmygame, dogemarket,... \n", + "1 278 [orangered, orangeredacademy, pasto_range, per... \n", + "2 23 [thedoctorstravels, sirron, aislynisdead, game... \n", + "3 768 [iracing, simracing, redditracing, team_medioc... \n", + "4 832 [perfumeexchange, indiemakeupandmore, asianbea... \n", + ".. ... ... \n", + "314 3712 [aggies] \n", + "315 3759 [brunei] \n", + "316 3769 [descentintotyranny] \n", + "317 3771 [outofthemetaloop] \n", + "318 3773 [pokemonshowdown] \n", + "\n", + " Num_subreddits \n", + "0 3172 \n", + "1 20 \n", + "2 8 \n", + "3 6 \n", + "4 6 \n", + ".. ... \n", + "314 1 \n", + "315 1 \n", + "316 1 \n", + "317 1 \n", + "318 1 \n", + "\n", + "[319 rows x 3 columns]" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + " CALL gds.wcc.stream('reddit')\n", + " YIELD nodeId, componentId\n", + " RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n", + " ORDER BY size(Subreddits) DESC\n", + "\"\"\"\n", + "wcc = gds.run_cypher(query)\n", + "wcc" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "9a2355cb", + "metadata": {}, + "source": [ + "We can see that the component with Id 0 has the max number of subreddits = 3172. So we will work only with that subgraph." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "d1994b04", + "metadata": {}, + "outputs": [], + "source": [ + "Largest_CC, _ = gds.beta.graph.project.subgraph(\n", + " 'largest_connected_components2', \n", + " G,\n", + " 'n.componentId=0', \n", + " '*'\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "f3e613a1", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Graph({'graphName': 'largest_connected_components2', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 14, 15, 29, 52, 126057108, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'e1d3750e-61f5-4928-b16c-4f5f566e09f1', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Largest_CC" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "17942d04", + "metadata": {}, + "source": [ + "### Community Detection using Louvain\n", + "\n", + "We use the [Louvain](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/louvain/) algorithm to detect communities in our subgraph and assign a louvainCommunityId to each community." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "def26464", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Louvain: 100%|██████████| 100.0/100 [00:12<00:00, 7.95%/s]\n" + ] + }, + { + "data": { + "text/plain": [ + "mutateMillis 0\n", + "nodePropertiesWritten 3172\n", + "modularity 0.58988\n", + "modularities [0.4494089141198883, 0.5373675216145954, 0.555...\n", + "ranLevels 10\n", + "communityCount 300\n", + "communityDistribution {'p99': 196, 'min': 1, 'max': 382, 'mean': 10....\n", + "postProcessingMillis 22\n", + "preProcessingMillis 1\n", + "computeMillis 12974\n", + "configuration {'maxIterations': 10, 'seedProperty': None, 'c...\n", + "Name: 0, dtype: object" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2 = gds.louvain.mutate(Largest_CC, mutateProperty='louvainCommunityId')\n", + "df2" + ] + }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "7563e824", + "metadata": {}, + "source": [ + "We get a modularity score of 0.5898 for our community detection algorithm." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "46969ec5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.5898798012505129" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2.modularity" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "15fc2baa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subreddit [componentId, louvainCommunityId]\n", + "dtype: object" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Largest_CC.node_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "858c65be", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
communityCountmodularitymodularities
03000.58988[0.4494089141198883, 0.5373675216145954, 0.555...
\n", + "
" + ], + "text/plain": [ + " communityCount modularity \\\n", + "0 300 0.58988 \n", + "\n", + " modularities \n", + "0 [0.4494089141198883, 0.5373675216145954, 0.555... " + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + " CALL gds.louvain.write('largest_connected_components2', { writeProperty: 'louvainCommunityId' })\n", + " YIELD communityCount, modularity, modularities\n", + "\"\"\"\n", + "communities = gds.run_cypher(query)\n", + "communities" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "22f73aea", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Subreddit [componentId, louvainCommunityId]\n", + "dtype: object" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "Largest_CC.node_properties()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "19ccfcb8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SubredditscommunityId
0[airsoft, bandnames, connecticut, thehiddenbar...2406
1[posthardcore, metalcore, corejerk, iama, karm...2612
2[locationbot, oldschoolcoolnsfw, uncomfortable...2579
3[playmygame, circlebroke, tribes, conspiratard...2676
4[radioreddit, autism, modhelp, digital_immorta...3158
.........
295[banishedmaps]3032
296[screenshots]3034
297[leangains]3039
298[agnostic]3040
299[mario]3043
\n", + "

300 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " Subreddits communityId\n", + "0 [airsoft, bandnames, connecticut, thehiddenbar... 2406\n", + "1 [posthardcore, metalcore, corejerk, iama, karm... 2612\n", + "2 [locationbot, oldschoolcoolnsfw, uncomfortable... 2579\n", + "3 [playmygame, circlebroke, tribes, conspiratard... 2676\n", + "4 [radioreddit, autism, modhelp, digital_immorta... 3158\n", + ".. ... ...\n", + "295 [banishedmaps] 3032\n", + "296 [screenshots] 3034\n", + "297 [leangains] 3039\n", + "298 [agnostic] 3040\n", + "299 [mario] 3043\n", + "\n", + "[300 rows x 2 columns]" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "query = \"\"\"\n", + " CALL gds.louvain.stream('largest_connected_components2')\n", + " YIELD nodeId, communityId, intermediateCommunityIds\n", + " RETURN collect(gds.util.asNode(nodeId).node_id) AS Subreddits, communityId\n", + " ORDER BY size(Subreddits) DESC\n", + "\"\"\"\n", + "wcc = gds.run_cypher(query)\n", + "wcc" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37b59b5b", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From bf70d3d9093c83f9ae839c928ab3c5592741ea5f Mon Sep 17 00:00:00 2001 From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com> Date: Mon, 15 May 2023 16:07:53 -0400 Subject: [PATCH 2/4] added reference to dataset --- examples/community-detection.ipynb | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb index f4068f4ee..9f55fc594 
100644 --- a/examples/community-detection.ipynb +++ b/examples/community-detection.ipynb @@ -1105,6 +1105,16 @@ "wcc" ] }, + { + "attachments": {}, + "cell_type": "markdown", + "id": "65dcb952", + "metadata": {}, + "source": [ + "### References\n", + "S. Kumar, W.L. Hamilton, J. Leskovec, D. Jurafsky. Community Interaction and Conflict on the Web. World Wide Web Conference, 2018." + ] + }, { "cell_type": "code", "execution_count": null, From 78e4b4522bae17c1b36e2d54269b57f11cbbffd8 Mon Sep 17 00:00:00 2001 From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com> Date: Tue, 16 May 2023 11:39:20 -0400 Subject: [PATCH 3/4] Made changes as per comments --- examples/community-detection.ipynb | 475 +++++++++++++++-------------- 1 file changed, 249 insertions(+), 226 deletions(-) diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb index 9f55fc594..bc3aa1b29 100644 --- a/examples/community-detection.ipynb +++ b/examples/community-detection.ipynb @@ -1,12 +1,29 @@ { "cells": [ { - "attachments": {}, + "cell_type": "markdown", + "id": "5b60d8ba", + "metadata": {}, + "source": [ + "# Community Detection" + ] + }, + { + "cell_type": "markdown", + "id": "3e2fb927", + "metadata": {}, + "source": [ + "\n", + " \"Open\n", + "" + ] + }, + { "cell_type": "markdown", "id": "f083f11b", "metadata": {}, "source": [ - "## Community Detection\n", + "This Jupyter notebook is hosted [here](https://github.com/neo4j/graph-data-science-client/blob/main/examples/community-detection.ipynb) in the Neo4j Graph Data Science Client Github repository.\n", "\n", "The notebook shows the usage of the `graphdatascience` library for community detection on the Reddit Hyperlink Network dataset that can be downloaded [here](https://snap.stanford.edu/data/soc-RedditHyperlinks.html). 
We will use the `soc-redditHyperlinks-body.tsv` file.\n", "\n", @@ -47,20 +64,12 @@ "execution_count": 2, "id": "46b33d2d", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "2.3.5\n" - ] - } - ], + "outputs": [], "source": [ "# # Replace with the actual connection URI and credentials\n", - "NEO4J_CONNECTION_URI = \"bolt://XXXXXXXXXXXXX\n", + "NEO4J_CONNECTION_URI = \"bolt://localhost:7687\"\n", "NEO4J_USERNAME = \"neo4j\"\n", - "NEO4J_PASSWORD = \"XXXXXXXXXXXXX\"\n", + "NEO4J_PASSWORD = \"<your-password>\"\n", "\n", "# Client instantiation\n", "gds = GraphDataScience(\n", @@ -68,11 +77,18 @@ " auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n", ")\n", "\n", - "print(gds.version())" + "# NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "# NEO4J_AUTH = None\n", + "# if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", + "# NEO4J_AUTH = (\n", + "# os.environ.get(\"NEO4J_USER\"),\n", + "# os.environ.get(\"NEO4J_PASSWORD\"),\n", + "# )\n", + "\n", + "# gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "48bd8af1", "metadata": {}, @@ -189,13 +205,12 @@ } ], "source": [ - "df = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\\t')\n", + "df = pd.read_csv('https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv', sep='\\t')\n", "df = df[df['TIMESTAMP'] < \"2014-03-01 02:51:13\"]\n", "df.head()" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "be7b1c5d", "metadata": {}, @@ -287,7 +302,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "59e6a5e9", "metadata": {}, @@ -378,7 +392,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "c30a4378", "metadata": {}, @@ -393,20 +406,54 @@ "metadata": { "scrolled": true }, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: []\n", + "Index: []" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "driver = GraphDatabase.driver(NEO4J_CONNECTION_URI,\n", - " auth=(NEO4J_USERNAME, NEO4J_PASSWORD))\n", - "\n", - "# Create nodes and relationships in the graph using UNWIND\n", - "with driver.session() as session:\n", - " # Create nodes using UNWIND\n", - " nodes_list = nodes_df.to_dict('records')\n", - " session.run(\"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {node_id: node_props.SUBREDDIT, node_label: node_props.SUBREDDIT})\", nodes_list=nodes_list)\n", + "gds.run_cypher(\n", + " \"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {name: node_props.SUBREDDIT})\",\n", + " params = {'nodes_list': nodes_df.to_dict('records')})\n", "\n", - " # Create relationships using UNWIND\n", - " edges_list = relationship_df.to_dict('records')\n", - " session.run(\"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {node_id: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {node_id: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO {relationship_type: rel_props.relationship_type}]->(target)\", edges_list=edges_list)" + "gds.run_cypher(\n", + " \"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {name: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {name: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO {relationship_type: rel_props.relationship_type}]->(target)\", \n", + " params = {'edges_list': relationship_df.to_dict('records')})" ] }, { @@ -419,14 +466,14 @@ "name": "stderr", "output_type": "stream", "text": [ - "Loading: 100%|██████████| 100.0/100 [00:09<00:00, 11.02%/s] \n" + "Loading: 100%|██████████| 100.0/100 [00:10<00:00, 9.27%/s] \n" ] }, { "name": "stdout", "output_type": "stream", "text": [ - "The projection took 9289 ms\n", + "The projection took 11405 ms\n", "Graph 'reddit' node count: 3801\n", "Graph 'reddit' node labels: 
['Subreddit']\n" ] @@ -499,8 +546,8 @@ " 6200\n", " {'relationshipProjection': {'HYPERLINKED_TO': ...\n", " 0.000429\n", - " 2023-05-14T15:29:30.028518203+00:00\n", - " 2023-05-14T15:29:30.931965175+00:00\n", + " 2023-05-16T14:07:25.933283995+00:00\n", + " 2023-05-16T14:07:27.112253586+00:00\n", " {'graphProperties': {}, 'relationships': {'HYP...\n", " {'graphProperties': {}, 'relationships': {'HYP...\n", " \n", @@ -519,7 +566,7 @@ "0 {'relationshipProjection': {'HYPERLINKED_TO': ... 0.000429 \n", "\n", " creationTime modificationTime \\\n", - "0 2023-05-14T15:29:30.028518203+00:00 2023-05-14T15:29:30.931965175+00:00 \n", + "0 2023-05-16T14:07:25.933283995+00:00 2023-05-16T14:07:27.112253586+00:00 \n", "\n", " schema \\\n", "0 {'graphProperties': {}, 'relationships': {'HYP... \n", @@ -538,7 +585,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "9c259471", "metadata": {}, @@ -564,7 +610,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "{'jobId': 'b69faaaa-a267-444c-82b2-d11c66f9a6a4', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n" + "{'jobId': 'c7d9036d-b9a5-4d91-8d95-70bccfd67c2d', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n" ] } ], @@ -595,9 +641,19 @@ "G.node_properties()" ] }, + { + "cell_type": "markdown", + "id": "23a0a487", + "metadata": {}, + "source": [ + "Next, we will see the size of each connected component and depending on that, we can pick the subgraph that needs further analysis.\n", + "\n", + "We use `run_cypher` here instead of the direct gds client call since we want to see the size of the connected components." 
+ ] + }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 14, "id": "2274a19a", "metadata": {}, "outputs": [ @@ -623,40 +679,40 @@ " \n", " \n", " componentId\n", - " Subreddits\n", - " Num_subreddits\n", + " communitySize\n", + " subreddit\n", " \n", " \n", " \n", " \n", " 0\n", " 0\n", - " [leagueoflegends, nfl, playmygame, dogemarket,...\n", " 3172\n", + " [leagueoflegends, nfl, playmygame, dogemarket,...\n", " \n", " \n", " 1\n", " 278\n", - " [orangered, orangeredacademy, pasto_range, per...\n", " 20\n", + " [orangered, orangeredacademy, pasto_range, per...\n", " \n", " \n", " 2\n", " 23\n", - " [thedoctorstravels, sirron, aislynisdead, game...\n", " 8\n", + " [thedoctorstravels, sirron, aislynisdead, game...\n", " \n", " \n", " 3\n", " 768\n", - " [iracing, simracing, redditracing, team_medioc...\n", " 6\n", + " [iracing, simracing, redditracing, team_medioc...\n", " \n", " \n", " 4\n", " 832\n", - " [perfumeexchange, indiemakeupandmore, asianbea...\n", " 6\n", + " [perfumeexchange, indiemakeupandmore, asianbea...\n", " \n", " \n", " ...\n", @@ -667,32 +723,32 @@ " \n", " 314\n", " 3712\n", - " [aggies]\n", " 1\n", + " [aggies]\n", " \n", " \n", " 315\n", " 3759\n", - " [brunei]\n", " 1\n", + " [brunei]\n", " \n", " \n", " 316\n", " 3769\n", - " [descentintotyranny]\n", " 1\n", + " [descentintotyranny]\n", " \n", " \n", " 317\n", " 3771\n", - " [outofthemetaloop]\n", " 1\n", + " [outofthemetaloop]\n", " \n", " \n", " 318\n", " 3773\n", - " [pokemonshowdown]\n", " 1\n", + " [pokemonshowdown]\n", " \n", " \n", "\n", @@ -700,53 +756,70 @@ "" ], "text/plain": [ - " componentId Subreddits \\\n", - "0 0 [leagueoflegends, nfl, playmygame, dogemarket,... \n", - "1 278 [orangered, orangeredacademy, pasto_range, per... \n", - "2 23 [thedoctorstravels, sirron, aislynisdead, game... \n", - "3 768 [iracing, simracing, redditracing, team_medioc... \n", - "4 832 [perfumeexchange, indiemakeupandmore, asianbea... \n", - ".. ... ... 
\n", - "314 3712 [aggies] \n", - "315 3759 [brunei] \n", - "316 3769 [descentintotyranny] \n", - "317 3771 [outofthemetaloop] \n", - "318 3773 [pokemonshowdown] \n", + " componentId communitySize \\\n", + "0 0 3172 \n", + "1 278 20 \n", + "2 23 8 \n", + "3 768 6 \n", + "4 832 6 \n", + ".. ... ... \n", + "314 3712 1 \n", + "315 3759 1 \n", + "316 3769 1 \n", + "317 3771 1 \n", + "318 3773 1 \n", "\n", - " Num_subreddits \n", - "0 3172 \n", - "1 20 \n", - "2 8 \n", - "3 6 \n", - "4 6 \n", - ".. ... \n", - "314 1 \n", - "315 1 \n", - "316 1 \n", - "317 1 \n", - "318 1 \n", + " subreddit \n", + "0 [leagueoflegends, nfl, playmygame, dogemarket,... \n", + "1 [orangered, orangeredacademy, pasto_range, per... \n", + "2 [thedoctorstravels, sirron, aislynisdead, game... \n", + "3 [iracing, simracing, redditracing, team_medioc... \n", + "4 [perfumeexchange, indiemakeupandmore, asianbea... \n", + ".. ... \n", + "314 [aggies] \n", + "315 [brunei] \n", + "316 [descentintotyranny] \n", + "317 [outofthemetaloop] \n", + "318 [pokemonshowdown] \n", "\n", "[319 rows x 3 columns]" ] }, - "execution_count": 11, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", - " CALL gds.wcc.stream('reddit')\n", - " YIELD nodeId, componentId\n", - " RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n", - " ORDER BY size(Subreddits) DESC\n", + " CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n", + " YIELD nodeId, propertyValue\n", + " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS componentId\n", + " WITH componentId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n", + " RETURN componentId, communitySize, subreddit\n", + " ORDER BY communitySize DESC\n", "\"\"\"\n", + "\n", + "# query = \"\"\"\n", + "# CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n", + "# YIELD name, 
propertyValue\n", + "# WITH name as name, gds.util.asNode(name).name AS name, propertyValue AS componentId\n", + "# WITH componentId, collect(name) AS subreddits, size(collect(name)) AS communitySize\n", + "# RETURN componentId, communitySize, subreddits\n", + "# ORDER BY communitySize DESC\n", + "# \"\"\"\n", + "\n", + "# query = \"\"\"\n", + "# CALL gds.wcc.stream('reddit')\n", + "# YIELD nodeId, componentId\n", + "# RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n", + "# ORDER BY size(Subreddits) DESC\n", + "# \"\"\"\n", "wcc = gds.run_cypher(query)\n", "wcc" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "9a2355cb", "metadata": {}, @@ -756,13 +829,13 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 17, "id": "d1994b04", "metadata": {}, "outputs": [], "source": [ "Largest_CC, _ = gds.beta.graph.project.subgraph(\n", - " 'largest_connected_components2', \n", + " 'largest_connected_components', \n", " G,\n", " 'n.componentId=0', \n", " '*'\n", @@ -771,17 +844,17 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 18, "id": "f3e613a1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Graph({'graphName': 'largest_connected_components2', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 14, 15, 29, 52, 126057108, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'e1d3750e-61f5-4928-b16c-4f5f566e09f1', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 
'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})" + "Graph({'graphName': 'largest_connected_components', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 16, 14, 43, 14, 779680794, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'f3f02536-8052-4bc9-b143-ca649e16e0d0', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})" ] }, - "execution_count": 13, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +864,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "17942d04", "metadata": {}, @@ -803,7 +875,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 19, "id": "def26464", "metadata": {}, "outputs": [ @@ -811,27 +883,27 @@ "name": "stderr", "output_type": "stream", "text": [ - "Louvain: 100%|██████████| 100.0/100 [00:12<00:00, 7.95%/s]\n" + "Louvain: 100%|██████████| 100.0/100 [00:11<00:00, 8.74%/s]\n" ] }, { "data": { "text/plain": [ - "mutateMillis 0\n", + "mutateMillis 4\n", "nodePropertiesWritten 3172\n", - "modularity 0.58988\n", - "modularities [0.4494089141198883, 0.5373675216145954, 0.555...\n", + "modularity 0.587643\n", + "modularities 
[0.4494090889646058, 0.5377130147763601, 0.555...\n", "ranLevels 10\n", "communityCount 300\n", "communityDistribution {'p99': 196, 'min': 1, 'max': 382, 'mean': 10....\n", - "postProcessingMillis 22\n", - "preProcessingMillis 1\n", - "computeMillis 12974\n", + "postProcessingMillis 17\n", + "preProcessingMillis 0\n", + "computeMillis 12084\n", "configuration {'maxIterations': 10, 'seedProperty': None, 'c...\n", "Name: 0, dtype: object" ] }, - "execution_count": 14, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -842,7 +914,6 @@ ] }, { - "attachments": {}, "cell_type": "markdown", "id": "7563e824", "metadata": {}, @@ -852,115 +923,40 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "46969ec5", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0.5898798012505129" - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df2.modularity" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "15fc2baa", + "execution_count": 20, + "id": "858c65be", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Subreddit [componentId, louvainCommunityId]\n", - "dtype: object" + "writeMillis 578\n", + "graphName largest_connected_components\n", + "nodeProperties [louvainCommunityId]\n", + "propertiesWritten 3172\n", + "Name: 0, dtype: object" ] }, - "execution_count": 16, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "Largest_CC.node_properties()" + "gds.graph.nodeProperties.write(Largest_CC, [\"louvainCommunityId\"])" ] }, { - "cell_type": "code", - "execution_count": 17, - "id": "858c65be", + "cell_type": "markdown", + "id": "afb104e0", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
communityCountmodularitymodularities
03000.58988[0.4494089141198883, 0.5373675216145954, 0.555...
\n", - "
" - ], - "text/plain": [ - " communityCount modularity \\\n", - "0 300 0.58988 \n", - "\n", - " modularities \n", - "0 [0.4494089141198883, 0.5373675216145954, 0.555... " - ] - }, - "execution_count": 17, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "query = \"\"\"\n", - " CALL gds.louvain.write('largest_connected_components2', { writeProperty: 'louvainCommunityId' })\n", - " YIELD communityCount, modularity, modularities\n", - "\"\"\"\n", - "communities = gds.run_cypher(query)\n", - "communities" + "We can also check that the property was written by the below command." ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 21, "id": "22f73aea", "metadata": {}, "outputs": [ @@ -971,7 +967,7 @@ "dtype: object" ] }, - "execution_count": 18, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -982,7 +978,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 23, "id": "19ccfcb8", "metadata": {}, "outputs": [ @@ -1007,106 +1003,133 @@ " \n", " \n", " \n", - " Subreddits\n", " communityId\n", + " communitySize\n", + " subreddit\n", " \n", " \n", " \n", " \n", " 0\n", - " [airsoft, bandnames, connecticut, thehiddenbar...\n", " 2406\n", + " 382\n", + " [airsoft, bandnames, connecticut, thehiddenbar...\n", " \n", " \n", " 1\n", + " 2516\n", + " 309\n", " [posthardcore, metalcore, corejerk, iama, karm...\n", - " 2612\n", " \n", " \n", " 2\n", + " 2654\n", + " 282\n", " [locationbot, oldschoolcoolnsfw, uncomfortable...\n", - " 2579\n", " \n", " \n", " 3\n", - " [playmygame, circlebroke, tribes, conspiratard...\n", " 2676\n", + " 196\n", + " [playmygame, circlebroke, tribes, conspiratard...\n", " \n", " \n", " 4\n", - " [radioreddit, autism, modhelp, digital_immorta...\n", - " 3158\n", + " 2546\n", + " 185\n", + " [leagueoflegends, kpop, turntablists, minecraf...\n", " \n", " \n", " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 295\n", - " [banishedmaps]\n", - " 3032\n", 
+ " 3034\n", + " 1\n", + " [screenshots]\n", " \n", " \n", " 296\n", - " [screenshots]\n", - " 3034\n", + " 3039\n", + " 1\n", + " [leangains]\n", " \n", " \n", " 297\n", - " [leangains]\n", - " 3039\n", + " 3040\n", + " 1\n", + " [agnostic]\n", " \n", " \n", " 298\n", - " [agnostic]\n", - " 3040\n", + " 3043\n", + " 1\n", + " [mario]\n", " \n", " \n", " 299\n", - " [mario]\n", - " 3043\n", + " 3045\n", + " 1\n", + " [vegproblems]\n", " \n", " \n", "\n", - "

300 rows × 2 columns

\n", + "

300 rows × 3 columns

\n", "" ], "text/plain": [ - " Subreddits communityId\n", - "0 [airsoft, bandnames, connecticut, thehiddenbar... 2406\n", - "1 [posthardcore, metalcore, corejerk, iama, karm... 2612\n", - "2 [locationbot, oldschoolcoolnsfw, uncomfortable... 2579\n", - "3 [playmygame, circlebroke, tribes, conspiratard... 2676\n", - "4 [radioreddit, autism, modhelp, digital_immorta... 3158\n", - ".. ... ...\n", - "295 [banishedmaps] 3032\n", - "296 [screenshots] 3034\n", - "297 [leangains] 3039\n", - "298 [agnostic] 3040\n", - "299 [mario] 3043\n", + " communityId communitySize \\\n", + "0 2406 382 \n", + "1 2516 309 \n", + "2 2654 282 \n", + "3 2676 196 \n", + "4 2546 185 \n", + ".. ... ... \n", + "295 3034 1 \n", + "296 3039 1 \n", + "297 3040 1 \n", + "298 3043 1 \n", + "299 3045 1 \n", + "\n", + " subreddit \n", + "0 [airsoft, bandnames, connecticut, thehiddenbar... \n", + "1 [posthardcore, metalcore, corejerk, iama, karm... \n", + "2 [locationbot, oldschoolcoolnsfw, uncomfortable... \n", + "3 [playmygame, circlebroke, tribes, conspiratard... \n", + "4 [leagueoflegends, kpop, turntablists, minecraf... \n", + ".. ... 
\n", + "295 [screenshots] \n", + "296 [leangains] \n", + "297 [agnostic] \n", + "298 [mario] \n", + "299 [vegproblems] \n", "\n", - "[300 rows x 2 columns]" + "[300 rows x 3 columns]" ] }, - "execution_count": 19, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "query = \"\"\"\n", - " CALL gds.louvain.stream('largest_connected_components2')\n", - " YIELD nodeId, communityId, intermediateCommunityIds\n", - " RETURN collect(gds.util.asNode(nodeId).node_id) AS Subreddits, communityId\n", - " ORDER BY size(Subreddits) DESC\n", + " CALL gds.graph.nodeProperties.stream('largest_connected_components', 'louvainCommunityId')\n", + " YIELD nodeId, propertyValue\n", + " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS communityId\n", + " WITH communityId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n", + " RETURN communityId, communitySize, subreddit\n", + " ORDER BY communitySize DESC\n", "\"\"\"\n", - "wcc = gds.run_cypher(query)\n", - "wcc" + "\n", + "communities = gds.run_cypher(query)\n", + "communities" ] }, { - "attachments": {}, "cell_type": "markdown", "id": "65dcb952", "metadata": {}, From 393f7b10306c9d8d0e7e8bdf66c6614e67ce5e24 Mon Sep 17 00:00:00 2001 From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com> Date: Tue, 16 May 2023 11:40:42 -0400 Subject: [PATCH 4/4] New changes --- examples/community-detection.ipynb | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb index bc3aa1b29..e288d8329 100644 --- a/examples/community-detection.ipynb +++ b/examples/community-detection.ipynb @@ -66,26 +66,15 @@ "metadata": {}, "outputs": [], "source": [ - "# # Replace with the actual connection URI and credentials\n", - "NEO4J_CONNECTION_URI = \"bolt://54.152.132.224:7687\"\n", - "NEO4J_USERNAME = \"neo4j\"\n", - "NEO4J_PASSWORD = \"scissors-hoists-tastes\"\n", 
+ "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", + "NEO4J_AUTH = None\n", + "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", + " NEO4J_AUTH = (\n", + " os.environ.get(\"NEO4J_USER\"),\n", + " os.environ.get(\"NEO4J_PASSWORD\"),\n", + " )\n", "\n", - "# Client instantiation\n", - "gds = GraphDataScience(\n", - " NEO4J_CONNECTION_URI,\n", - " auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n", - ")\n", - "\n", - "# NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n", - "# NEO4J_AUTH = None\n", - "# if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n", - "# NEO4J_AUTH = (\n", - "# os.environ.get(\"NEO4J_USER\"),\n", - "# os.environ.get(\"NEO4J_PASSWORD\"),\n", - "# )\n", - "\n", - "# gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)" + "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)" ] }, {