From 75ebc66cac94b316fee1ab82c856d42392a47a6d Mon Sep 17 00:00:00 2001
From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com>
Date: Mon, 15 May 2023 16:02:34 -0400
Subject: [PATCH 1/4] added community detection jupyter notebook
---
examples/community-detection.ipynb | 1138 ++++++++++++++++++++++++++++
1 file changed, 1138 insertions(+)
create mode 100644 examples/community-detection.ipynb
diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb
new file mode 100644
index 000000000..f4068f4ee
--- /dev/null
+++ b/examples/community-detection.ipynb
@@ -0,0 +1,1138 @@
+{
+ "cells": [
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "f083f11b",
+ "metadata": {},
+ "source": [
+ "## Community Detection\n",
+ "\n",
+ "The notebook shows the usage of the `graphdatascience` library for community detection on the Reddit Hyperlink Network dataset that can be downloaded [here](https://snap.stanford.edu/data/soc-RedditHyperlinks.html). We will use the `soc-redditHyperlinks-body.tsv` file.\n",
+ "\n",
+ "The tasks we cover here include performing initial graph preprocessing using Weakly Connected Components and then performing community detection on the largest component using the Louvain algorithm.\n",
+ "\n",
+ "### Setup\n",
+ "\n",
+ "We need to import the following libraries:\n",
+ "- graphdatascience\n",
+ "- neo4j\n",
+ "- pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "id": "3953e353",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "C:\\Users\\kedar\\anaconda3\\envs\\graph_stuff\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from graphdatascience import GraphDataScience\n",
+ "from neo4j import GraphDatabase\n",
+ "from neo4j.exceptions import ServiceUnavailable\n",
+ "import pandas as pd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "id": "46b33d2d",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "2.3.5\n"
+ ]
+ }
+ ],
+ "source": [
+    "# Replace with the actual connection URI and credentials\n",
+    "NEO4J_CONNECTION_URI = \"bolt://XXXXXXXXXXXXX\"\n",
+ "NEO4J_USERNAME = \"neo4j\"\n",
+ "NEO4J_PASSWORD = \"XXXXXXXXXXXXX\"\n",
+ "\n",
+ "# Client instantiation\n",
+ "gds = GraphDataScience(\n",
+ " NEO4J_CONNECTION_URI,\n",
+ " auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n",
+ ")\n",
+ "\n",
+ "print(gds.version())"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "48bd8af1",
+ "metadata": {},
+ "source": [
+ "### Importing the dataset\n",
+ "\n",
+    "We import the dataset as a pandas dataframe first. We work with only a subset of the dataset: the sampled data includes only records up to 1st March 2014. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "id": "a8e677aa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SOURCE_SUBREDDIT | \n",
+ " TARGET_SUBREDDIT | \n",
+ " POST_ID | \n",
+ " TIMESTAMP | \n",
+ " LINK_SENTIMENT | \n",
+ " PROPERTIES | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " leagueoflegends | \n",
+ " teamredditteams | \n",
+ " 1u4nrps | \n",
+ " 2013-12-31 16:39:58 | \n",
+ " 1 | \n",
+ " 345.0,298.0,0.75652173913,0.0173913043478,0.08... | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " theredlion | \n",
+ " soccer | \n",
+ " 1u4qkd | \n",
+ " 2013-12-31 18:18:37 | \n",
+ " -1 | \n",
+ " 101.0,98.0,0.742574257426,0.019801980198,0.049... | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " inlandempire | \n",
+ " bikela | \n",
+ " 1u4qlzs | \n",
+ " 2014-01-01 14:54:35 | \n",
+ " 1 | \n",
+ " 85.0,85.0,0.752941176471,0.0235294117647,0.082... | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " nfl | \n",
+ " cfb | \n",
+ " 1u4sjvs | \n",
+ " 2013-12-31 17:37:55 | \n",
+ " 1 | \n",
+ " 1124.0,949.0,0.772241992883,0.0017793594306,0.... | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " playmygame | \n",
+ " gamedev | \n",
+ " 1u4w5ss | \n",
+ " 2014-01-01 02:51:13 | \n",
+ " 1 | \n",
+ " 715.0,622.0,0.777622377622,0.00699300699301,0.... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SOURCE_SUBREDDIT TARGET_SUBREDDIT POST_ID TIMESTAMP \\\n",
+ "0 leagueoflegends teamredditteams 1u4nrps 2013-12-31 16:39:58 \n",
+ "1 theredlion soccer 1u4qkd 2013-12-31 18:18:37 \n",
+ "2 inlandempire bikela 1u4qlzs 2014-01-01 14:54:35 \n",
+ "3 nfl cfb 1u4sjvs 2013-12-31 17:37:55 \n",
+ "4 playmygame gamedev 1u4w5ss 2014-01-01 02:51:13 \n",
+ "\n",
+ " LINK_SENTIMENT PROPERTIES \n",
+ "0 1 345.0,298.0,0.75652173913,0.0173913043478,0.08... \n",
+ "1 -1 101.0,98.0,0.742574257426,0.019801980198,0.049... \n",
+ "2 1 85.0,85.0,0.752941176471,0.0235294117647,0.082... \n",
+ "3 1 1124.0,949.0,0.772241992883,0.0017793594306,0.... \n",
+ "4 1 715.0,622.0,0.777622377622,0.00699300699301,0.... "
+ ]
+ },
+ "execution_count": 3,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\\t')\n",
+ "df = df[df['TIMESTAMP'] < \"2014-03-01 02:51:13\"]\n",
+ "df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "be7b1c5d",
+ "metadata": {},
+ "source": [
+ "The `LINK_SENTIMENT` column tells if there is a positive (+1) or negative (-1) relationship from the source subreddit to destination subreddit. We filter out the negative sentiment relationships as they won't add to any meaningful communities. We also drop duplicate relationships."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "id": "2f153da1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SOURCE_SUBREDDIT | \n",
+ " TARGET_SUBREDDIT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " leagueoflegends | \n",
+ " teamredditteams | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " inlandempire | \n",
+ " bikela | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " nfl | \n",
+ " cfb | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " playmygame | \n",
+ " gamedev | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " dogemarket | \n",
+ " dogecoin | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SOURCE_SUBREDDIT TARGET_SUBREDDIT\n",
+ "0 leagueoflegends teamredditteams\n",
+ "2 inlandempire bikela\n",
+ "3 nfl cfb\n",
+ "4 playmygame gamedev\n",
+ "5 dogemarket dogecoin"
+ ]
+ },
+ "execution_count": 4,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "relationship_df = df[df['LINK_SENTIMENT'] == 1]\n",
+ "columns = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT']\n",
+ "relationship_df = relationship_df[columns]\n",
+ "relationship_df = relationship_df.drop_duplicates()\n",
+ "relationship_df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "59e6a5e9",
+ "metadata": {},
+ "source": [
+ "Next, we get a list of all the distinct nodes (source or destination) and load them as a dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "id": "6266953f",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " SUBREDDIT | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " leagueoflegends | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " theredlion | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " inlandempire | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " nfl | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " playmygame | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " SUBREDDIT\n",
+ "0 leagueoflegends\n",
+ "1 theredlion\n",
+ "2 inlandempire\n",
+ "3 nfl\n",
+ "4 playmygame"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# get unique nodes for each column\n",
+ "source_nodes = pd.Series(df['SOURCE_SUBREDDIT']).unique()\n",
+ "target_nodes = pd.Series(df['TARGET_SUBREDDIT']).unique()\n",
+ "\n",
+ "# get unique nodes for both columns\n",
+ "all_nodes = pd.Series(pd.concat([df['SOURCE_SUBREDDIT'], df['TARGET_SUBREDDIT']])).unique()\n",
+ "\n",
+ "# create new dataframe with distinct nodes\n",
+ "nodes_df = pd.DataFrame({'SUBREDDIT': all_nodes})\n",
+ "nodes_df.head()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "c30a4378",
+ "metadata": {},
+ "source": [
+ "Finally, we load this data (nodes and edges) into a Graph Database and a GDS graph."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "id": "0b18e522",
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [],
+ "source": [
+ "driver = GraphDatabase.driver(NEO4J_CONNECTION_URI,\n",
+ " auth=(NEO4J_USERNAME, NEO4J_PASSWORD))\n",
+ "\n",
+ "# Create nodes and relationships in the graph using UNWIND\n",
+ "with driver.session() as session:\n",
+ " # Create nodes using UNWIND\n",
+ " nodes_list = nodes_df.to_dict('records')\n",
+ " session.run(\"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {node_id: node_props.SUBREDDIT, node_label: node_props.SUBREDDIT})\", nodes_list=nodes_list)\n",
+ "\n",
+ " # Create relationships using UNWIND\n",
+ " edges_list = relationship_df.to_dict('records')\n",
+    "    session.run(\"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {node_id: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {node_id: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO]->(target)\", edges_list=edges_list)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "id": "7a3509e8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Loading: 100%|██████████| 100.0/100 [00:09<00:00, 11.02%/s] \n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The projection took 9289 ms\n",
+ "Graph 'reddit' node count: 3801\n",
+ "Graph 'reddit' node labels: ['Subreddit']\n"
+ ]
+ }
+ ],
+ "source": [
+ "node_projection = [\"Subreddit\"]\n",
+ "relationship_projection = {\"HYPERLINKED_TO\": {\"orientation\": \"NATURAL\"}}\n",
+ "\n",
+ "G, result = gds.graph.project(\"reddit\", node_projection, relationship_projection) #, nodeProperties = ['node_id', 'node_label']\n",
+ "\n",
+ "print(f\"The projection took {result['projectMillis']} ms\")\n",
+ "\n",
+ "# We can use convenience methods on `G` to check if the projection looks correct\n",
+ "print(f\"Graph '{G.name()}' node count: {G.node_count()}\")\n",
+ "print(f\"Graph '{G.name()}' node labels: {G.node_labels()}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "id": "61aa6afe",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " degreeDistribution | \n",
+ " graphName | \n",
+ " database | \n",
+ " memoryUsage | \n",
+ " sizeInBytes | \n",
+ " nodeCount | \n",
+ " relationshipCount | \n",
+ " configuration | \n",
+ " density | \n",
+ " creationTime | \n",
+ " modificationTime | \n",
+ " schema | \n",
+ " schemaWithOrientation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " {'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631... | \n",
+ " reddit | \n",
+ " neo4j | \n",
+ " 876 KiB | \n",
+ " 897448 | \n",
+ " 3801 | \n",
+ " 6200 | \n",
+ " {'relationshipProjection': {'HYPERLINKED_TO': ... | \n",
+ " 0.000429 | \n",
+ " 2023-05-14T15:29:30.028518203+00:00 | \n",
+ " 2023-05-14T15:29:30.931965175+00:00 | \n",
+ " {'graphProperties': {}, 'relationships': {'HYP... | \n",
+ " {'graphProperties': {}, 'relationships': {'HYP... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " degreeDistribution graphName database \\\n",
+ "0 {'p99': 15, 'min': 0, 'max': 87, 'mean': 1.631... reddit neo4j \n",
+ "\n",
+ " memoryUsage sizeInBytes nodeCount relationshipCount \\\n",
+ "0 876 KiB 897448 3801 6200 \n",
+ "\n",
+ " configuration density \\\n",
+ "0 {'relationshipProjection': {'HYPERLINKED_TO': ... 0.000429 \n",
+ "\n",
+ " creationTime modificationTime \\\n",
+ "0 2023-05-14T15:29:30.028518203+00:00 2023-05-14T15:29:30.931965175+00:00 \n",
+ "\n",
+ " schema \\\n",
+ "0 {'graphProperties': {}, 'relationships': {'HYP... \n",
+ "\n",
+ " schemaWithOrientation \n",
+ "0 {'graphProperties': {}, 'relationships': {'HYP... "
+ ]
+ },
+ "execution_count": 8,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gds.graph.list()"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "9c259471",
+ "metadata": {},
+ "source": [
+ "### Weakly Connected Components\n",
+ "\n",
+ "A graph dataset need not always be connected. That is, there may not exist a path from every node to \n",
+    "every other node in the graph dataset (subgraphs in it may not be connected to each other at all). Hence, we \n",
+ "need to find the total number of nodes in each subgraph to see if it is big enough for further graph analysis. \n",
+ "Smaller subgraphs or lone nodes will not contribute to the community detection task and should be \n",
+ "eliminated. Weakly Connected Components is often used as one of the early steps of graph preprocessing.\n",
+ "\n",
+ "We use the [Weakly Connected Components](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/wcc/) algorithm to find sets of connected nodes and assign each set a component id."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "id": "7a114af1",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "{'jobId': 'b69faaaa-a267-444c-82b2-d11c66f9a6a4', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n"
+ ]
+ }
+ ],
+ "source": [
+ "df = gds.wcc.mutate(G, mutateProperty='componentId')\n",
+ "print(df.configuration)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "id": "04fd557e",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Subreddit [componentId]\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "G.node_properties()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "id": "2274a19a",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " componentId | \n",
+ " Subreddits | \n",
+ " Num_subreddits | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 0 | \n",
+ " [leagueoflegends, nfl, playmygame, dogemarket,... | \n",
+ " 3172 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 278 | \n",
+ " [orangered, orangeredacademy, pasto_range, per... | \n",
+ " 20 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 23 | \n",
+ " [thedoctorstravels, sirron, aislynisdead, game... | \n",
+ " 8 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 768 | \n",
+ " [iracing, simracing, redditracing, team_medioc... | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 832 | \n",
+ " [perfumeexchange, indiemakeupandmore, asianbea... | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 314 | \n",
+ " 3712 | \n",
+ " [aggies] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 315 | \n",
+ " 3759 | \n",
+ " [brunei] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 316 | \n",
+ " 3769 | \n",
+ " [descentintotyranny] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 317 | \n",
+ " 3771 | \n",
+ " [outofthemetaloop] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ " 318 | \n",
+ " 3773 | \n",
+ " [pokemonshowdown] | \n",
+ " 1 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
319 rows × 3 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " componentId Subreddits \\\n",
+ "0 0 [leagueoflegends, nfl, playmygame, dogemarket,... \n",
+ "1 278 [orangered, orangeredacademy, pasto_range, per... \n",
+ "2 23 [thedoctorstravels, sirron, aislynisdead, game... \n",
+ "3 768 [iracing, simracing, redditracing, team_medioc... \n",
+ "4 832 [perfumeexchange, indiemakeupandmore, asianbea... \n",
+ ".. ... ... \n",
+ "314 3712 [aggies] \n",
+ "315 3759 [brunei] \n",
+ "316 3769 [descentintotyranny] \n",
+ "317 3771 [outofthemetaloop] \n",
+ "318 3773 [pokemonshowdown] \n",
+ "\n",
+ " Num_subreddits \n",
+ "0 3172 \n",
+ "1 20 \n",
+ "2 8 \n",
+ "3 6 \n",
+ "4 6 \n",
+ ".. ... \n",
+ "314 1 \n",
+ "315 1 \n",
+ "316 1 \n",
+ "317 1 \n",
+ "318 1 \n",
+ "\n",
+ "[319 rows x 3 columns]"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "query = \"\"\"\n",
+ " CALL gds.wcc.stream('reddit')\n",
+ " YIELD nodeId, componentId\n",
+ " RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n",
+ " ORDER BY size(Subreddits) DESC\n",
+ "\"\"\"\n",
+ "wcc = gds.run_cypher(query)\n",
+ "wcc"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "9a2355cb",
+ "metadata": {},
+ "source": [
+ "We can see that the component with Id 0 has the max number of subreddits = 3172. So we will work only with that subgraph."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "id": "d1994b04",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "Largest_CC, _ = gds.beta.graph.project.subgraph(\n",
+ " 'largest_connected_components2', \n",
+ " G,\n",
+ " 'n.componentId=0', \n",
+ " '*'\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "id": "f3e613a1",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Graph({'graphName': 'largest_connected_components2', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 14, 15, 29, 52, 126057108, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'e1d3750e-61f5-4928-b16c-4f5f566e09f1', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Largest_CC"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "17942d04",
+ "metadata": {},
+ "source": [
+ "### Community Detection using Louvain\n",
+ "\n",
+ "We use the [Louvain](https://neo4j.com/docs/graph-data-science/2.4-preview/algorithms/louvain/) algorithm to detect communities in our subgraph and assign a louvainCommunityId to each community."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "id": "def26464",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Louvain: 100%|██████████| 100.0/100 [00:12<00:00, 7.95%/s]\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "mutateMillis 0\n",
+ "nodePropertiesWritten 3172\n",
+ "modularity 0.58988\n",
+ "modularities [0.4494089141198883, 0.5373675216145954, 0.555...\n",
+ "ranLevels 10\n",
+ "communityCount 300\n",
+ "communityDistribution {'p99': 196, 'min': 1, 'max': 382, 'mean': 10....\n",
+ "postProcessingMillis 22\n",
+ "preProcessingMillis 1\n",
+ "computeMillis 12974\n",
+ "configuration {'maxIterations': 10, 'seedProperty': None, 'c...\n",
+ "Name: 0, dtype: object"
+ ]
+ },
+ "execution_count": 14,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2 = gds.louvain.mutate(Largest_CC, mutateProperty='louvainCommunityId')\n",
+ "df2"
+ ]
+ },
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "7563e824",
+ "metadata": {},
+ "source": [
+ "We get a modularity score of 0.5898 for our community detection algorithm."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "id": "46969ec5",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.5898798012505129"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "df2.modularity"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "id": "15fc2baa",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Subreddit [componentId, louvainCommunityId]\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Largest_CC.node_properties()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "id": "858c65be",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " communityCount | \n",
+ " modularity | \n",
+ " modularities | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 300 | \n",
+ " 0.58988 | \n",
+ " [0.4494089141198883, 0.5373675216145954, 0.555... | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " communityCount modularity \\\n",
+ "0 300 0.58988 \n",
+ "\n",
+ " modularities \n",
+ "0 [0.4494089141198883, 0.5373675216145954, 0.555... "
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "query = \"\"\"\n",
+ " CALL gds.louvain.write('largest_connected_components2', { writeProperty: 'louvainCommunityId' })\n",
+ " YIELD communityCount, modularity, modularities\n",
+ "\"\"\"\n",
+ "communities = gds.run_cypher(query)\n",
+ "communities"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "id": "22f73aea",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Subreddit [componentId, louvainCommunityId]\n",
+ "dtype: object"
+ ]
+ },
+ "execution_count": 18,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "Largest_CC.node_properties()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 19,
+ "id": "19ccfcb8",
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Subreddits | \n",
+ " communityId | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " [airsoft, bandnames, connecticut, thehiddenbar... | \n",
+ " 2406 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " [posthardcore, metalcore, corejerk, iama, karm... | \n",
+ " 2612 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " [locationbot, oldschoolcoolnsfw, uncomfortable... | \n",
+ " 2579 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " [playmygame, circlebroke, tribes, conspiratard... | \n",
+ " 2676 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " [radioreddit, autism, modhelp, digital_immorta... | \n",
+ " 3158 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 295 | \n",
+ " [banishedmaps] | \n",
+ " 3032 | \n",
+ "
\n",
+ " \n",
+ " 296 | \n",
+ " [screenshots] | \n",
+ " 3034 | \n",
+ "
\n",
+ " \n",
+ " 297 | \n",
+ " [leangains] | \n",
+ " 3039 | \n",
+ "
\n",
+ " \n",
+ " 298 | \n",
+ " [agnostic] | \n",
+ " 3040 | \n",
+ "
\n",
+ " \n",
+ " 299 | \n",
+ " [mario] | \n",
+ " 3043 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
300 rows × 2 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Subreddits communityId\n",
+ "0 [airsoft, bandnames, connecticut, thehiddenbar... 2406\n",
+ "1 [posthardcore, metalcore, corejerk, iama, karm... 2612\n",
+ "2 [locationbot, oldschoolcoolnsfw, uncomfortable... 2579\n",
+ "3 [playmygame, circlebroke, tribes, conspiratard... 2676\n",
+ "4 [radioreddit, autism, modhelp, digital_immorta... 3158\n",
+ ".. ... ...\n",
+ "295 [banishedmaps] 3032\n",
+ "296 [screenshots] 3034\n",
+ "297 [leangains] 3039\n",
+ "298 [agnostic] 3040\n",
+ "299 [mario] 3043\n",
+ "\n",
+ "[300 rows x 2 columns]"
+ ]
+ },
+ "execution_count": 19,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "query = \"\"\"\n",
+ " CALL gds.louvain.stream('largest_connected_components2')\n",
+ " YIELD nodeId, communityId, intermediateCommunityIds\n",
+ " RETURN collect(gds.util.asNode(nodeId).node_id) AS Subreddits, communityId\n",
+ " ORDER BY size(Subreddits) DESC\n",
+ "\"\"\"\n",
+ "wcc = gds.run_cypher(query)\n",
+ "wcc"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "37b59b5b",
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.10.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
From bf70d3d9093c83f9ae839c928ab3c5592741ea5f Mon Sep 17 00:00:00 2001
From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com>
Date: Mon, 15 May 2023 16:07:53 -0400
Subject: [PATCH 2/4] added reference to dataset
---
examples/community-detection.ipynb | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb
index f4068f4ee..9f55fc594 100644
--- a/examples/community-detection.ipynb
+++ b/examples/community-detection.ipynb
@@ -1105,6 +1105,16 @@
"wcc"
]
},
+ {
+ "attachments": {},
+ "cell_type": "markdown",
+ "id": "65dcb952",
+ "metadata": {},
+ "source": [
+ "### References\n",
+ "S. Kumar, W.L. Hamilton, J. Leskovec, D. Jurafsky. Community Interaction and Conflict on the Web. World Wide Web Conference, 2018."
+ ]
+ },
{
"cell_type": "code",
"execution_count": null,
From 78e4b4522bae17c1b36e2d54269b57f11cbbffd8 Mon Sep 17 00:00:00 2001
From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com>
Date: Tue, 16 May 2023 11:39:20 -0400
Subject: [PATCH 3/4] Made changes as per comments
---
examples/community-detection.ipynb | 475 +++++++++++++++--------------
1 file changed, 249 insertions(+), 226 deletions(-)
diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb
index 9f55fc594..bc3aa1b29 100644
--- a/examples/community-detection.ipynb
+++ b/examples/community-detection.ipynb
@@ -1,12 +1,29 @@
{
"cells": [
{
- "attachments": {},
+ "cell_type": "markdown",
+ "id": "5b60d8ba",
+ "metadata": {},
+ "source": [
+ "# Community Detection"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "3e2fb927",
+ "metadata": {},
+ "source": [
+ "\n",
+ "
\n",
+ ""
+ ]
+ },
+ {
"cell_type": "markdown",
"id": "f083f11b",
"metadata": {},
"source": [
- "## Community Detection\n",
+ "This Jupyter notebook is hosted [here](https://github.com/neo4j/graph-data-science-client/blob/main/examples/community-detection.ipynb) in the Neo4j Graph Data Science Client Github repository.\n",
"\n",
"The notebook shows the usage of the `graphdatascience` library for community detection on the Reddit Hyperlink Network dataset that can be downloaded [here](https://snap.stanford.edu/data/soc-RedditHyperlinks.html). We will use the `soc-redditHyperlinks-body.tsv` file.\n",
"\n",
@@ -47,20 +64,12 @@
"execution_count": 2,
"id": "46b33d2d",
"metadata": {},
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "2.3.5\n"
- ]
- }
- ],
+ "outputs": [],
"source": [
"# # Replace with the actual connection URI and credentials\n",
- "NEO4J_CONNECTION_URI = \"bolt://XXXXXXXXXXXXX\n",
+    "NEO4J_CONNECTION_URI = \"bolt://<HOST>:7687\"  # replace with your instance URI; do not commit real endpoints\n",
 "NEO4J_USERNAME = \"neo4j\"\n",
-    "NEO4J_PASSWORD = \"XXXXXXXXXXXXX\"\n",
+    "NEO4J_PASSWORD = \"<PASSWORD>\"  # never commit real credentials; prefer os.environ\n",
"\n",
"# Client instantiation\n",
"gds = GraphDataScience(\n",
@@ -68,11 +77,18 @@
" auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n",
")\n",
"\n",
- "print(gds.version())"
+ "# NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
+ "# NEO4J_AUTH = None\n",
+ "# if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n",
+ "# NEO4J_AUTH = (\n",
+ "# os.environ.get(\"NEO4J_USER\"),\n",
+ "# os.environ.get(\"NEO4J_PASSWORD\"),\n",
+ "# )\n",
+ "\n",
+ "# gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "48bd8af1",
"metadata": {},
@@ -189,13 +205,12 @@
}
],
"source": [
- "df = pd.read_csv('soc-redditHyperlinks-body.tsv', sep='\\t')\n",
+ "df = pd.read_csv('https://snap.stanford.edu/data/soc-redditHyperlinks-body.tsv', sep='\\t')\n",
"df = df[df['TIMESTAMP'] < \"2014-03-01 02:51:13\"]\n",
"df.head()"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "be7b1c5d",
"metadata": {},
@@ -287,7 +302,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "59e6a5e9",
"metadata": {},
@@ -378,7 +392,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "c30a4378",
"metadata": {},
@@ -393,20 +406,54 @@
"metadata": {
"scrolled": true
},
- "outputs": [],
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ "Empty DataFrame\n",
+ "Columns: []\n",
+ "Index: []"
+ ]
+ },
+ "execution_count": 6,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
"source": [
- "driver = GraphDatabase.driver(NEO4J_CONNECTION_URI,\n",
- " auth=(NEO4J_USERNAME, NEO4J_PASSWORD))\n",
- "\n",
- "# Create nodes and relationships in the graph using UNWIND\n",
- "with driver.session() as session:\n",
- " # Create nodes using UNWIND\n",
- " nodes_list = nodes_df.to_dict('records')\n",
- " session.run(\"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {node_id: node_props.SUBREDDIT, node_label: node_props.SUBREDDIT})\", nodes_list=nodes_list)\n",
+ "gds.run_cypher(\n",
+ " \"UNWIND $nodes_list AS node_props CREATE (n:Subreddit {name: node_props.SUBREDDIT})\",\n",
+ " params = {'nodes_list': nodes_df.to_dict('records')})\n",
"\n",
- " # Create relationships using UNWIND\n",
- " edges_list = relationship_df.to_dict('records')\n",
- " session.run(\"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {node_id: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {node_id: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO {relationship_type: rel_props.relationship_type}]->(target)\", edges_list=edges_list)"
+ "gds.run_cypher(\n",
+    "    \"UNWIND $edges_list AS rel_props MATCH (source:Subreddit {name: rel_props.SOURCE_SUBREDDIT}), (target:Subreddit {name: rel_props.TARGET_SUBREDDIT}) CREATE (source)-[:HYPERLINKED_TO]->(target)\", \n",
+ " params = {'edges_list': relationship_df.to_dict('records')})"
]
},
{
@@ -419,14 +466,14 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Loading: 100%|██████████| 100.0/100 [00:09<00:00, 11.02%/s] \n"
+ "Loading: 100%|██████████| 100.0/100 [00:10<00:00, 9.27%/s] \n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
- "The projection took 9289 ms\n",
+ "The projection took 11405 ms\n",
"Graph 'reddit' node count: 3801\n",
"Graph 'reddit' node labels: ['Subreddit']\n"
]
@@ -499,8 +546,8 @@
" 6200 | \n",
" {'relationshipProjection': {'HYPERLINKED_TO': ... | \n",
" 0.000429 | \n",
- " 2023-05-14T15:29:30.028518203+00:00 | \n",
- " 2023-05-14T15:29:30.931965175+00:00 | \n",
+ " 2023-05-16T14:07:25.933283995+00:00 | \n",
+ " 2023-05-16T14:07:27.112253586+00:00 | \n",
" {'graphProperties': {}, 'relationships': {'HYP... | \n",
" {'graphProperties': {}, 'relationships': {'HYP... | \n",
" \n",
@@ -519,7 +566,7 @@
"0 {'relationshipProjection': {'HYPERLINKED_TO': ... 0.000429 \n",
"\n",
" creationTime modificationTime \\\n",
- "0 2023-05-14T15:29:30.028518203+00:00 2023-05-14T15:29:30.931965175+00:00 \n",
+ "0 2023-05-16T14:07:25.933283995+00:00 2023-05-16T14:07:27.112253586+00:00 \n",
"\n",
" schema \\\n",
"0 {'graphProperties': {}, 'relationships': {'HYP... \n",
@@ -538,7 +585,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "9c259471",
"metadata": {},
@@ -564,7 +610,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "{'jobId': 'b69faaaa-a267-444c-82b2-d11c66f9a6a4', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n"
+ "{'jobId': 'c7d9036d-b9a5-4d91-8d95-70bccfd67c2d', 'seedProperty': None, 'consecutiveIds': False, 'threshold': 0.0, 'logProgress': True, 'nodeLabels': ['*'], 'sudo': False, 'relationshipTypes': ['*'], 'mutateProperty': 'componentId', 'concurrency': 4}\n"
]
}
],
@@ -595,9 +641,19 @@
"G.node_properties()"
]
},
+ {
+ "cell_type": "markdown",
+ "id": "23a0a487",
+ "metadata": {},
+ "source": [
+ "Next, we will examine the size of each connected component. Based on this, we can pick the subgraph that needs further analysis.\n",
+ "\n",
+ "We use `run_cypher` here instead of the direct GDS client call since we want to aggregate and inspect the size of each connected component."
+ ]
+ },
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 14,
"id": "2274a19a",
"metadata": {},
"outputs": [
@@ -623,40 +679,40 @@
" \n",
" | \n",
" componentId | \n",
- " Subreddits | \n",
- " Num_subreddits | \n",
+ " communitySize | \n",
+ " subreddit | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
- " [leagueoflegends, nfl, playmygame, dogemarket,... | \n",
" 3172 | \n",
+ " [leagueoflegends, nfl, playmygame, dogemarket,... | \n",
"
\n",
" \n",
" 1 | \n",
" 278 | \n",
- " [orangered, orangeredacademy, pasto_range, per... | \n",
" 20 | \n",
+ " [orangered, orangeredacademy, pasto_range, per... | \n",
"
\n",
" \n",
" 2 | \n",
" 23 | \n",
- " [thedoctorstravels, sirron, aislynisdead, game... | \n",
" 8 | \n",
+ " [thedoctorstravels, sirron, aislynisdead, game... | \n",
"
\n",
" \n",
" 3 | \n",
" 768 | \n",
- " [iracing, simracing, redditracing, team_medioc... | \n",
" 6 | \n",
+ " [iracing, simracing, redditracing, team_medioc... | \n",
"
\n",
" \n",
" 4 | \n",
" 832 | \n",
- " [perfumeexchange, indiemakeupandmore, asianbea... | \n",
" 6 | \n",
+ " [perfumeexchange, indiemakeupandmore, asianbea... | \n",
"
\n",
" \n",
" ... | \n",
@@ -667,32 +723,32 @@
"
\n",
" 314 | \n",
" 3712 | \n",
- " [aggies] | \n",
" 1 | \n",
+ " [aggies] | \n",
"
\n",
" \n",
" 315 | \n",
" 3759 | \n",
- " [brunei] | \n",
" 1 | \n",
+ " [brunei] | \n",
"
\n",
" \n",
" 316 | \n",
" 3769 | \n",
- " [descentintotyranny] | \n",
" 1 | \n",
+ " [descentintotyranny] | \n",
"
\n",
" \n",
" 317 | \n",
" 3771 | \n",
- " [outofthemetaloop] | \n",
" 1 | \n",
+ " [outofthemetaloop] | \n",
"
\n",
" \n",
" 318 | \n",
" 3773 | \n",
- " [pokemonshowdown] | \n",
" 1 | \n",
+ " [pokemonshowdown] | \n",
"
\n",
" \n",
"\n",
@@ -700,53 +756,70 @@
""
],
"text/plain": [
- " componentId Subreddits \\\n",
- "0 0 [leagueoflegends, nfl, playmygame, dogemarket,... \n",
- "1 278 [orangered, orangeredacademy, pasto_range, per... \n",
- "2 23 [thedoctorstravels, sirron, aislynisdead, game... \n",
- "3 768 [iracing, simracing, redditracing, team_medioc... \n",
- "4 832 [perfumeexchange, indiemakeupandmore, asianbea... \n",
- ".. ... ... \n",
- "314 3712 [aggies] \n",
- "315 3759 [brunei] \n",
- "316 3769 [descentintotyranny] \n",
- "317 3771 [outofthemetaloop] \n",
- "318 3773 [pokemonshowdown] \n",
+ " componentId communitySize \\\n",
+ "0 0 3172 \n",
+ "1 278 20 \n",
+ "2 23 8 \n",
+ "3 768 6 \n",
+ "4 832 6 \n",
+ ".. ... ... \n",
+ "314 3712 1 \n",
+ "315 3759 1 \n",
+ "316 3769 1 \n",
+ "317 3771 1 \n",
+ "318 3773 1 \n",
"\n",
- " Num_subreddits \n",
- "0 3172 \n",
- "1 20 \n",
- "2 8 \n",
- "3 6 \n",
- "4 6 \n",
- ".. ... \n",
- "314 1 \n",
- "315 1 \n",
- "316 1 \n",
- "317 1 \n",
- "318 1 \n",
+ " subreddit \n",
+ "0 [leagueoflegends, nfl, playmygame, dogemarket,... \n",
+ "1 [orangered, orangeredacademy, pasto_range, per... \n",
+ "2 [thedoctorstravels, sirron, aislynisdead, game... \n",
+ "3 [iracing, simracing, redditracing, team_medioc... \n",
+ "4 [perfumeexchange, indiemakeupandmore, asianbea... \n",
+ ".. ... \n",
+ "314 [aggies] \n",
+ "315 [brunei] \n",
+ "316 [descentintotyranny] \n",
+ "317 [outofthemetaloop] \n",
+ "318 [pokemonshowdown] \n",
"\n",
"[319 rows x 3 columns]"
]
},
- "execution_count": 11,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"\"\"\n",
- " CALL gds.wcc.stream('reddit')\n",
- " YIELD nodeId, componentId\n",
- " RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n",
- " ORDER BY size(Subreddits) DESC\n",
+ " CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n",
+ " YIELD nodeId, propertyValue\n",
+ " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS componentId\n",
+ " WITH componentId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n",
+ " RETURN componentId, communitySize, subreddit\n",
+ " ORDER BY communitySize DESC\n",
"\"\"\"\n",
+ "\n",
+ "# query = \"\"\"\n",
+ "# CALL gds.graph.nodeProperties.stream('reddit', 'componentId')\n",
+ "# YIELD name, propertyValue\n",
+ "# WITH name as name, gds.util.asNode(name).name AS name, propertyValue AS componentId\n",
+ "# WITH componentId, collect(name) AS subreddits, size(collect(name)) AS communitySize\n",
+ "# RETURN componentId, communitySize, subreddits\n",
+ "# ORDER BY communitySize DESC\n",
+ "# \"\"\"\n",
+ "\n",
+ "# query = \"\"\"\n",
+ "# CALL gds.wcc.stream('reddit')\n",
+ "# YIELD nodeId, componentId\n",
+ "# RETURN componentId, collect(gds.util.asNode(nodeId).node_id) AS Subreddits, size(collect(gds.util.asNode(nodeId).node_id)) AS Num_subreddits\n",
+ "# ORDER BY size(Subreddits) DESC\n",
+ "# \"\"\"\n",
"wcc = gds.run_cypher(query)\n",
"wcc"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "9a2355cb",
"metadata": {},
@@ -756,13 +829,13 @@
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 17,
"id": "d1994b04",
"metadata": {},
"outputs": [],
"source": [
"Largest_CC, _ = gds.beta.graph.project.subgraph(\n",
- " 'largest_connected_components2', \n",
+ " 'largest_connected_components', \n",
" G,\n",
" 'n.componentId=0', \n",
" '*'\n",
@@ -771,17 +844,17 @@
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 18,
"id": "f3e613a1",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Graph({'graphName': 'largest_connected_components2', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 14, 15, 29, 52, 126057108, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'e1d3750e-61f5-4928-b16c-4f5f566e09f1', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})"
+ "Graph({'graphName': 'largest_connected_components', 'nodeCount': 3172, 'relationshipCount': 5858, 'database': 'neo4j', 'configuration': {'relationshipProperties': {}, 'creationTime': neo4j.time.DateTime(2023, 5, 16, 14, 43, 14, 779680794, tzinfo=), 'validateRelationships': False, 'nodeFilter': 'n.componentId=0', 'relationshipFilter': '*', 'nodeProperties': {}, 'concurrency': 4, 'relationshipProjection': {'HYPERLINKED_TO': {'orientation': 'NATURAL', 'indexInverse': False, 'aggregation': 'DEFAULT', 'type': 'HYPERLINKED_TO', 'properties': {}}}, 'jobId': 'f3f02536-8052-4bc9-b143-ca649e16e0d0', 'nodeProjection': {'Subreddit': {'label': 'Subreddit', 'properties': {}}}, 'logProgress': True, 'readConcurrency': 4, 'sudo': False, 'parameters': {}}, 'schema': {'graphProperties': {}, 'relationships': {'HYPERLINKED_TO': {}}, 'nodes': {'Subreddit': {'componentId': 'Integer (DefaultValue(-9223372036854775808), TRANSIENT)'}}}, 'memoryUsage': '901 KiB'})"
]
},
- "execution_count": 13,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
@@ -791,7 +864,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "17942d04",
"metadata": {},
@@ -803,7 +875,7 @@
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 19,
"id": "def26464",
"metadata": {},
"outputs": [
@@ -811,27 +883,27 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "Louvain: 100%|██████████| 100.0/100 [00:12<00:00, 7.95%/s]\n"
+ "Louvain: 100%|██████████| 100.0/100 [00:11<00:00, 8.74%/s]\n"
]
},
{
"data": {
"text/plain": [
- "mutateMillis 0\n",
+ "mutateMillis 4\n",
"nodePropertiesWritten 3172\n",
- "modularity 0.58988\n",
- "modularities [0.4494089141198883, 0.5373675216145954, 0.555...\n",
+ "modularity 0.587643\n",
+ "modularities [0.4494090889646058, 0.5377130147763601, 0.555...\n",
"ranLevels 10\n",
"communityCount 300\n",
"communityDistribution {'p99': 196, 'min': 1, 'max': 382, 'mean': 10....\n",
- "postProcessingMillis 22\n",
- "preProcessingMillis 1\n",
- "computeMillis 12974\n",
+ "postProcessingMillis 17\n",
+ "preProcessingMillis 0\n",
+ "computeMillis 12084\n",
"configuration {'maxIterations': 10, 'seedProperty': None, 'c...\n",
"Name: 0, dtype: object"
]
},
- "execution_count": 14,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -842,7 +914,6 @@
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "7563e824",
"metadata": {},
@@ -852,115 +923,40 @@
},
{
"cell_type": "code",
- "execution_count": 15,
- "id": "46969ec5",
- "metadata": {},
- "outputs": [
- {
- "data": {
- "text/plain": [
- "0.5898798012505129"
- ]
- },
- "execution_count": 15,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "df2.modularity"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 16,
- "id": "15fc2baa",
+ "execution_count": 20,
+ "id": "858c65be",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
- "Subreddit [componentId, louvainCommunityId]\n",
- "dtype: object"
+ "writeMillis 578\n",
+ "graphName largest_connected_components\n",
+ "nodeProperties [louvainCommunityId]\n",
+ "propertiesWritten 3172\n",
+ "Name: 0, dtype: object"
]
},
- "execution_count": 16,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "Largest_CC.node_properties()"
+ "gds.graph.nodeProperties.write(Largest_CC, [\"louvainCommunityId\"])"
]
},
{
- "cell_type": "code",
- "execution_count": 17,
- "id": "858c65be",
+ "cell_type": "markdown",
+ "id": "afb104e0",
"metadata": {},
- "outputs": [
- {
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " communityCount | \n",
- " modularity | \n",
- " modularities | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " 0 | \n",
- " 300 | \n",
- " 0.58988 | \n",
- " [0.4494089141198883, 0.5373675216145954, 0.555... | \n",
- "
\n",
- " \n",
- "
\n",
- "
"
- ],
- "text/plain": [
- " communityCount modularity \\\n",
- "0 300 0.58988 \n",
- "\n",
- " modularities \n",
- "0 [0.4494089141198883, 0.5373675216145954, 0.555... "
- ]
- },
- "execution_count": 17,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
"source": [
- "query = \"\"\"\n",
- " CALL gds.louvain.write('largest_connected_components2', { writeProperty: 'louvainCommunityId' })\n",
- " YIELD communityCount, modularity, modularities\n",
- "\"\"\"\n",
- "communities = gds.run_cypher(query)\n",
- "communities"
+ "We can also verify that the property was written by running the command below."
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 21,
"id": "22f73aea",
"metadata": {},
"outputs": [
@@ -971,7 +967,7 @@
"dtype: object"
]
},
- "execution_count": 18,
+ "execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
@@ -982,7 +978,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 23,
"id": "19ccfcb8",
"metadata": {},
"outputs": [
@@ -1007,106 +1003,133 @@
" \n",
" \n",
" | \n",
- " Subreddits | \n",
" communityId | \n",
+ " communitySize | \n",
+ " subreddit | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
- " [airsoft, bandnames, connecticut, thehiddenbar... | \n",
" 2406 | \n",
+ " 382 | \n",
+ " [airsoft, bandnames, connecticut, thehiddenbar... | \n",
"
\n",
" \n",
" 1 | \n",
+ " 2516 | \n",
+ " 309 | \n",
" [posthardcore, metalcore, corejerk, iama, karm... | \n",
- " 2612 | \n",
"
\n",
" \n",
" 2 | \n",
+ " 2654 | \n",
+ " 282 | \n",
" [locationbot, oldschoolcoolnsfw, uncomfortable... | \n",
- " 2579 | \n",
"
\n",
" \n",
" 3 | \n",
- " [playmygame, circlebroke, tribes, conspiratard... | \n",
" 2676 | \n",
+ " 196 | \n",
+ " [playmygame, circlebroke, tribes, conspiratard... | \n",
"
\n",
" \n",
" 4 | \n",
- " [radioreddit, autism, modhelp, digital_immorta... | \n",
- " 3158 | \n",
+ " 2546 | \n",
+ " 185 | \n",
+ " [leagueoflegends, kpop, turntablists, minecraf... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
+ " ... | \n",
"
\n",
" \n",
" 295 | \n",
- " [banishedmaps] | \n",
- " 3032 | \n",
+ " 3034 | \n",
+ " 1 | \n",
+ " [screenshots] | \n",
"
\n",
" \n",
" 296 | \n",
- " [screenshots] | \n",
- " 3034 | \n",
+ " 3039 | \n",
+ " 1 | \n",
+ " [leangains] | \n",
"
\n",
" \n",
" 297 | \n",
- " [leangains] | \n",
- " 3039 | \n",
+ " 3040 | \n",
+ " 1 | \n",
+ " [agnostic] | \n",
"
\n",
" \n",
" 298 | \n",
- " [agnostic] | \n",
- " 3040 | \n",
+ " 3043 | \n",
+ " 1 | \n",
+ " [mario] | \n",
"
\n",
" \n",
" 299 | \n",
- " [mario] | \n",
- " 3043 | \n",
+ " 3045 | \n",
+ " 1 | \n",
+ " [vegproblems] | \n",
"
\n",
" \n",
"\n",
- "300 rows × 2 columns
\n",
+ "300 rows × 3 columns
\n",
""
],
"text/plain": [
- " Subreddits communityId\n",
- "0 [airsoft, bandnames, connecticut, thehiddenbar... 2406\n",
- "1 [posthardcore, metalcore, corejerk, iama, karm... 2612\n",
- "2 [locationbot, oldschoolcoolnsfw, uncomfortable... 2579\n",
- "3 [playmygame, circlebroke, tribes, conspiratard... 2676\n",
- "4 [radioreddit, autism, modhelp, digital_immorta... 3158\n",
- ".. ... ...\n",
- "295 [banishedmaps] 3032\n",
- "296 [screenshots] 3034\n",
- "297 [leangains] 3039\n",
- "298 [agnostic] 3040\n",
- "299 [mario] 3043\n",
+ " communityId communitySize \\\n",
+ "0 2406 382 \n",
+ "1 2516 309 \n",
+ "2 2654 282 \n",
+ "3 2676 196 \n",
+ "4 2546 185 \n",
+ ".. ... ... \n",
+ "295 3034 1 \n",
+ "296 3039 1 \n",
+ "297 3040 1 \n",
+ "298 3043 1 \n",
+ "299 3045 1 \n",
+ "\n",
+ " subreddit \n",
+ "0 [airsoft, bandnames, connecticut, thehiddenbar... \n",
+ "1 [posthardcore, metalcore, corejerk, iama, karm... \n",
+ "2 [locationbot, oldschoolcoolnsfw, uncomfortable... \n",
+ "3 [playmygame, circlebroke, tribes, conspiratard... \n",
+ "4 [leagueoflegends, kpop, turntablists, minecraf... \n",
+ ".. ... \n",
+ "295 [screenshots] \n",
+ "296 [leangains] \n",
+ "297 [agnostic] \n",
+ "298 [mario] \n",
+ "299 [vegproblems] \n",
"\n",
- "[300 rows x 2 columns]"
+ "[300 rows x 3 columns]"
]
},
- "execution_count": 19,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"query = \"\"\"\n",
- " CALL gds.louvain.stream('largest_connected_components2')\n",
- " YIELD nodeId, communityId, intermediateCommunityIds\n",
- " RETURN collect(gds.util.asNode(nodeId).node_id) AS Subreddits, communityId\n",
- " ORDER BY size(Subreddits) DESC\n",
+ " CALL gds.graph.nodeProperties.stream('largest_connected_components', 'louvainCommunityId')\n",
+ " YIELD nodeId, propertyValue\n",
+ " WITH nodeId as nodeId, gds.util.asNode(nodeId).name AS node, propertyValue AS communityId\n",
+ " WITH communityId, collect(node) AS subreddit, size(collect(nodeId)) AS communitySize\n",
+ " RETURN communityId, communitySize, subreddit\n",
+ " ORDER BY communitySize DESC\n",
"\"\"\"\n",
- "wcc = gds.run_cypher(query)\n",
- "wcc"
+ "\n",
+ "communities = gds.run_cypher(query)\n",
+ "communities"
]
},
{
- "attachments": {},
"cell_type": "markdown",
"id": "65dcb952",
"metadata": {},
From 393f7b10306c9d8d0e7e8bdf66c6614e67ce5e24 Mon Sep 17 00:00:00 2001
From: Kedar Ghule <41315903+kedarghule@users.noreply.github.com>
Date: Tue, 16 May 2023 11:40:42 -0400
Subject: [PATCH 4/4] New changes
---
examples/community-detection.ipynb | 27 ++++++++-------------------
1 file changed, 8 insertions(+), 19 deletions(-)
diff --git a/examples/community-detection.ipynb b/examples/community-detection.ipynb
index bc3aa1b29..e288d8329 100644
--- a/examples/community-detection.ipynb
+++ b/examples/community-detection.ipynb
@@ -66,26 +66,15 @@
"metadata": {},
"outputs": [],
"source": [
- "# # Replace with the actual connection URI and credentials\n",
- "NEO4J_CONNECTION_URI = \"bolt://54.152.132.224:7687\"\n",
- "NEO4J_USERNAME = \"neo4j\"\n",
- "NEO4J_PASSWORD = \"scissors-hoists-tastes\"\n",
+ "NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
+ "NEO4J_AUTH = None\n",
+ "if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n",
+ " NEO4J_AUTH = (\n",
+ " os.environ.get(\"NEO4J_USER\"),\n",
+ " os.environ.get(\"NEO4J_PASSWORD\"),\n",
+ " )\n",
"\n",
- "# Client instantiation\n",
- "gds = GraphDataScience(\n",
- " NEO4J_CONNECTION_URI,\n",
- " auth=(NEO4J_USERNAME, NEO4J_PASSWORD)\n",
- ")\n",
- "\n",
- "# NEO4J_URI = os.environ.get(\"NEO4J_URI\", \"bolt://localhost:7687\")\n",
- "# NEO4J_AUTH = None\n",
- "# if os.environ.get(\"NEO4J_USER\") and os.environ.get(\"NEO4J_PASSWORD\"):\n",
- "# NEO4J_AUTH = (\n",
- "# os.environ.get(\"NEO4J_USER\"),\n",
- "# os.environ.get(\"NEO4J_PASSWORD\"),\n",
- "# )\n",
- "\n",
- "# gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)"
+ "gds = GraphDataScience(NEO4J_URI, auth=NEO4J_AUTH)"
]
},
{