From 3b895e0ad4dd33c113b3fab54de5d2cbd355e5be Mon Sep 17 00:00:00 2001 From: Jane Wong <78888328+janecww@users.noreply.github.com> Date: Wed, 14 Jun 2023 15:30:48 +0800 Subject: [PATCH] Delete Codes/Task 4 directory --- .../Part_B_Stock_Anomaly_Detection.ipynb | 2061 ------------- ..._Anomaly_Associated_Events_Analysis_.ipynb | 2591 ----------------- 2 files changed, 4652 deletions(-) delete mode 100644 Codes/Task 4/Part_B_Stock_Anomaly_Detection.ipynb delete mode 100644 Codes/Task 4/Part_C_Stock_Anomaly_Associated_Events_Analysis_.ipynb diff --git a/Codes/Task 4/Part_B_Stock_Anomaly_Detection.ipynb b/Codes/Task 4/Part_B_Stock_Anomaly_Detection.ipynb deleted file mode 100644 index d2717c8..0000000 --- a/Codes/Task 4/Part_B_Stock_Anomaly_Detection.ipynb +++ /dev/null @@ -1,2061 +0,0 @@ -{ - "nbformat": 4, - "nbformat_minor": 0, - "metadata": { - "colab": { - "provenance": [], - "include_colab_link": true - }, - "kernelspec": { - "name": "python3", - "display_name": "Python 3" - }, - "language_info": { - "name": "python" - }, - "widgets": { - "application/vnd.jupyter.widget-state+json": { - "2d12c002ccb64f0faf09a1dce1e103b0": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HBoxModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HBoxModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HBoxView", - "box_style": "", - "children": [ - "IPY_MODEL_3713652868294ce1876c9d7a04e3ce42", - "IPY_MODEL_bc105f4b911e4eeeb4e0c5d3246a068a", - "IPY_MODEL_a90550a9ce124f089e5baa9c850fef6f" - ], - "layout": "IPY_MODEL_ad002fdb6b5e41eeaa7e2cd02ccba541" - } - }, - "3713652868294ce1876c9d7a04e3ce42": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_1bbd3415268d44edb3ba7fdce43b816d", - "placeholder": "​", - "style": "IPY_MODEL_6199e690822a44d19623d7febd3c5e58", - "value": "100%" - } - }, - "bc105f4b911e4eeeb4e0c5d3246a068a": { - "model_module": "@jupyter-widgets/controls", - "model_name": "FloatProgressModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "FloatProgressModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "ProgressView", - "bar_style": "success", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_aface8e24692405baaa4e13c356d199f", - "max": 64, - "min": 0, - "orientation": "horizontal", - "style": "IPY_MODEL_ee66fb56cee04a1487557dfc74f1ccad", - "value": 64 - } - }, - "a90550a9ce124f089e5baa9c850fef6f": { - "model_module": "@jupyter-widgets/controls", - "model_name": "HTMLModel", - "model_module_version": "1.5.0", - "state": { - "_dom_classes": [], - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "HTMLModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/controls", - "_view_module_version": "1.5.0", - "_view_name": "HTMLView", - "description": "", - "description_tooltip": null, - "layout": "IPY_MODEL_901b687c700c4bd9bbcc07f793eb72ca", - "placeholder": "​", - "style": "IPY_MODEL_bd7e7b51165946a5834fd688fdc6686c", - "value": " 64/64 [01:05<00:00, 4.10s/it]" - } - }, - "ad002fdb6b5e41eeaa7e2cd02ccba541": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "1bbd3415268d44edb3ba7fdce43b816d": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "6199e690822a44d19623d7febd3c5e58": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - }, - "aface8e24692405baaa4e13c356d199f": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "ee66fb56cee04a1487557dfc74f1ccad": { - "model_module": "@jupyter-widgets/controls", - "model_name": "ProgressStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "ProgressStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "bar_color": null, - "description_width": "" - } - }, - "901b687c700c4bd9bbcc07f793eb72ca": { - "model_module": "@jupyter-widgets/base", - "model_name": "LayoutModel", - "model_module_version": "1.2.0", - "state": { - "_model_module": "@jupyter-widgets/base", - "_model_module_version": "1.2.0", - "_model_name": "LayoutModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "LayoutView", - "align_content": null, - "align_items": null, - "align_self": null, - "border": null, - "bottom": null, - "display": null, - "flex": null, - "flex_flow": null, - "grid_area": null, - "grid_auto_columns": null, - "grid_auto_flow": null, - "grid_auto_rows": null, - "grid_column": null, - "grid_gap": null, - "grid_row": null, - "grid_template_areas": null, - "grid_template_columns": null, - "grid_template_rows": null, - "height": null, - "justify_content": null, - "justify_items": null, - "left": null, - "margin": null, - "max_height": null, - "max_width": null, - "min_height": null, - "min_width": null, - "object_fit": null, - "object_position": null, - "order": null, - "overflow": null, - "overflow_x": null, - "overflow_y": null, - "padding": null, - "right": null, - "top": null, - "visibility": null, - "width": null - } - }, - "bd7e7b51165946a5834fd688fdc6686c": { - "model_module": "@jupyter-widgets/controls", - "model_name": "DescriptionStyleModel", - "model_module_version": "1.5.0", - "state": { - "_model_module": "@jupyter-widgets/controls", - "_model_module_version": "1.5.0", - "_model_name": "DescriptionStyleModel", - "_view_count": null, - "_view_module": "@jupyter-widgets/base", - "_view_module_version": "1.2.0", - "_view_name": "StyleView", - "description_width": "" - } - } - } - } - }, - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "6QeMfKCzuY9q" - }, - "outputs": [], - "source": [ - "from statsmodels.tsa.stattools import adfuller\n", - "from statsmodels.graphics.tsaplots import plot_acf,plot_pacf\n", - "import statsmodels.api as sm\n", - "from statsmodels.tsa.arima_model import ARIMA\n", - "from statsmodels.tsa.arima_model import ARMA\n", - "from statsmodels.tools.parallel import parallel_func as Parallel\n", - "\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "from matplotlib.pylab import rcParams\n", - "import matplotlib.pyplot as plt\n", - "import plotly.express as px\n", - "import plotly.graph_objects as go\n", - "from tqdm import tqdm_notebook\n", - "import math\n", - "import numpy as np\n", - "import pandas as pd\n", - "\n", - "from itertools import product\n", - "import warnings\n", - "warnings.filterwarnings('ignore')\n", - "\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from sklearn.decomposition import LatentDirichletAllocation\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.covariance import EllipticEnvelope\n", - "from sklearn.ensemble import IsolationForest" - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "l6e8Wwu_upH_" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "#Import Monthly Index Data" - ], - "metadata": { - "id": "z_ZFR4hTuzXi" - } - }, - { - "cell_type": "code", - "source": [ - "df = pd.read_csv('Hang Seng Index_Monthly.csv')\n", - "df = df[['Date','Close']]\n", - "df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 423 - }, - "id": "98ysjm-Bu12c", - "outputId": "c9ebfee0-55dc-4085-f13a-80328bf09bd9" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Date Close\n", - "0 2017-01-01 23360.779297\n", - "1 2017-02-01 23740.730469\n", - "2 2017-03-01 24111.589844\n", - "3 2017-04-01 24615.130859\n", - "4 2017-05-01 25660.650391\n", - ".. ... ...\n", - "64 2022-05-01 21415.199219\n", - "65 2022-06-01 21859.789063\n", - "66 2022-07-01 20156.509766\n", - "67 2022-08-01 19954.390625\n", - "68 2022-09-01 17222.830078\n", - "\n", - "[69 rows x 2 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
DateClose
02017-01-0123360.779297
12017-02-0123740.730469
22017-03-0124111.589844
32017-04-0124615.130859
42017-05-0125660.650391
.........
642022-05-0121415.199219
652022-06-0121859.789063
662022-07-0120156.509766
672022-08-0119954.390625
682022-09-0117222.830078
\n", - "

69 rows × 2 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 177 - } - ] - }, - { - "cell_type": "code", - "source": [ - "df.info()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Kvjxc_6VvELA", - "outputId": "c646e25d-4bb0-4d4f-e137-e601d54e3170" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "\n", - "RangeIndex: 69 entries, 0 to 68\n", - "Data columns (total 2 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 Date 69 non-null object \n", - " 1 Close 69 non-null float64\n", - "dtypes: float64(1), object(1)\n", - "memory usage: 1.2+ KB\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "# Convert Date to datetime\n", - "df['Date']=pd.to_datetime(df['Date'])\n", - "df.set_index('Date',inplace=True)\n", - "df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 455 - }, - "id": "D-_u73ArvNwt", - "outputId": "a3c7953d-d1c5-4db1-9518-7e55e134d953" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Close\n", - "Date \n", - "2017-01-01 23360.779297\n", - "2017-02-01 23740.730469\n", - "2017-03-01 24111.589844\n", - "2017-04-01 24615.130859\n", - "2017-05-01 25660.650391\n", - "... ...\n", - "2022-05-01 21415.199219\n", - "2022-06-01 21859.789063\n", - "2022-07-01 20156.509766\n", - "2022-08-01 19954.390625\n", - "2022-09-01 17222.830078\n", - "\n", - "[69 rows x 1 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
Close
Date
2017-01-0123360.779297
2017-02-0123740.730469
2017-03-0124111.589844
2017-04-0124615.130859
2017-05-0125660.650391
......
2022-05-0121415.199219
2022-06-0121859.789063
2022-07-0120156.509766
2022-08-0119954.390625
2022-09-0117222.830078
\n", - "

69 rows × 1 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 179 - } - ] - }, - { - "cell_type": "code", - "source": [ - "fig = px.line(df, y= 'Close', title='Stock Close Price Changes', template = 'plotly_dark')\n", - "fig.show()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "id": "aeIQxX__w94u", - "outputId": "b960158a-56df-4d99-b587-09272e55aa47" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "phJWrCB6yhb1" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "markdown", - "source": [ - "![newplot.png]()" - ], - "metadata": { - "id": "2LuJGP3wNbNX" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Augmented Dickey-Fuller test to check for dataset stationarity\n", - "- Augmented Dickey-Fuller unit root test.\n", - "\n", - "The Augmented Dickey-Fuller test can be used to test for a unit root in a univariate process in the presence of serial correlation.\n", - "\n" - ], - "metadata": { - "id": "KT6i83iNy_nG" - } - }, - { - "cell_type": "code", - "source": [ - "passing_data=adfuller(df['Close'])\n", - "passing_data" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "n7rzPtiUzF4T", - "outputId": "5e6f964e-a83f-4509-f7d0-b86a31327510" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "(-0.2577016009143149,\n", - " 0.9312850791504195,\n", - " 1,\n", - " 67,\n", - " {'1%': -3.5319549603840894,\n", - " '5%': -2.905755128523123,\n", - " '10%': -2.5903569458676765},\n", - " 987.4529587262712)" - ] - }, - "metadata": {}, - "execution_count": 181 - } - ] - }, - { - "cell_type": "code", - "source": [ - "type(df['Close'])" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "n-LMOMcb1bgR", - "outputId": "8065b670-df7e-429d-9616-a26bc6d06b11" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "pandas.core.series.Series" - ] - }, - "metadata": {}, - "execution_count": 182 - } - ] - }, - { - "cell_type": "code", - "source": [ - "def adf_test(series):\n", - " result=adfuller(series)\n", - " labels = ['Test parameters', 'p-value','#Lags Used','Num of Observations Used']\n", - " for value,label in zip(result,labels):\n", - " print(label+' : '+str(value) )\n", - " if result[1] <= 0.05:\n", - " print(\"Dataset is stationary\")\n", - " else:\n", - " print(\"Dataset is non-stationary \")\n", - "\n", - "adf_test(df['Close'])" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Obm8Ed3803E9", - "outputId": "0b2550b2-281f-46ed-ce22-5d2de75d0b53" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Test parameters : -0.2577016009143149\n", - "p-value : 0.9312850791504195\n", - "#Lags Used : 1\n", - "Num of Observations Used : 67\n", - "Dataset is non-stationary \n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "- p-value: 0.9312850791504195\n", - "- This number is greater than 0.05. It implies that the time series is non-stationary." - ], - "metadata": { - "id": "YvKsI8mB29oP" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Transformation to Stationary Time Series Using Differencing\n", - "The observations in a stationary time series are not dependent on time.\n", - "\n", - "Time series are stationary if they do not have trend or seasonal effects. Summary statistics calculated on the time series are consistent over time, like the mean or the variance of the observations.\n", - "\n", - "When a time series is stationary, it can be easier to model. Statistical modeling methods assume or require the time series to be stationary." - ], - "metadata": { - "id": "cxfqqNXvLYgs" - } - }, - { - "cell_type": "code", - "source": [ - "def differencing(data, column, order):\n", - " #difference that series at lag-12 because our data were collected monthly.\n", - " data['Close_diff'] = data[column].diff(order)\n", - " data['Close_diff'].fillna(data['Close_diff'].mean(), inplace=True)\n", - " #data['Close_diff']=df['Close']-df['Close'].shift(12)\n", - " #data['Close_diff'].fillna(data['Close_diff'].mean(), inplace=True)\n", - " return data\n", - "differenced_df = differencing(df, 'Close',1)\n", - "differenced_df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 455 - }, - "id": "uCt9NQK2Kj3b", - "outputId": "5f4a9bc8-aa3c-41cb-cf40-abe84e7dd099" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - " Close Close_diff\n", - "Date \n", - "2017-01-01 23360.779297 -90.263959\n", - "2017-02-01 23740.730469 379.951172\n", - "2017-03-01 24111.589844 370.859375\n", - "2017-04-01 24615.130859 503.541015\n", - "2017-05-01 25660.650391 1045.519532\n", - "... ... ...\n", - "2022-05-01 21415.199219 325.808594\n", - "2022-06-01 21859.789063 444.589844\n", - "2022-07-01 20156.509766 -1703.279297\n", - "2022-08-01 19954.390625 -202.119141\n", - "2022-09-01 17222.830078 -2731.560547\n", - "\n", - "[69 rows x 2 columns]" - ], - "text/html": [ - "\n", - "
\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
CloseClose_diff
Date
2017-01-0123360.779297-90.263959
2017-02-0123740.730469379.951172
2017-03-0124111.589844370.859375
2017-04-0124615.130859503.541015
2017-05-0125660.6503911045.519532
.........
2022-05-0121415.199219325.808594
2022-06-0121859.789063444.589844
2022-07-0120156.509766-1703.279297
2022-08-0119954.390625-202.119141
2022-09-0117222.830078-2731.560547
\n", - "

69 rows × 2 columns

\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - "
\n", - " " - ] - }, - "metadata": {}, - "execution_count": 188 - } - ] - }, - { - "cell_type": "code", - "source": [ - "adf_test(differenced_df['Close_diff'])" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "SQQFUMGvMaAx", - "outputId": "8d5bc8df-7647-4060-807e-e872843c70a3" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - "Test parameters : -9.359697510715247\n", - "p-value : 7.883737480503631e-16\n", - "#Lags Used : 0\n", - "Num of Observations Used : 68\n", - "Dataset is stationary\n" - ] - } - ] - }, - { - "cell_type": "code", - "source": [ - "fig = px.line(differenced_df, y= 'Close_diff', title='Stock Close Price Changes After Differencing', template = 'plotly_dark')\n", - "fig.show()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "id": "zDAhwN8bNWGW", - "outputId": "ff90cd60-182d-4886-c8ab-3df7ae8b9261" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "![newplot (1).png]()" - ], - "metadata": { - "id": "g9QNkYPSNfC5" - } - }, - { - "cell_type": "markdown", - "source": [ - "# Implementing a ARIMA (Autoregressive Integrated Moving Average) Model\n", - "- to better understand the data or to predict future points in the series.\n" - ], - "metadata": { - "id": "Oi19dgnwRM_-" - } - }, - { - "cell_type": "markdown", - "source": [ - "## Select the optimal order using AIC" - ], - "metadata": { - "id": "zUMzIRr2lAXI" - } - }, - { - "cell_type": "code", - "source": [ - "def optimize_ARIMA(order_list, exog):\n", - " \"\"\"\n", - " Return dataframe with parameters and corresponding AIC\n", - "\n", - " order_list - list with (p, d, q) tuples\n", - " exog - the exogenous variable\n", - " \"\"\"\n", - "\n", - " results = []\n", - "\n", - " for order in tqdm_notebook(order_list):\n", - " try:\n", - " model = ARIMA(exog, order=order).fit(disp=-1)\n", - " except:\n", - " continue\n", - "\n", - " aic = model.aic\n", - " results.append([order, model.aic])\n", - "\n", - " result_df = pd.DataFrame(results)\n", - " result_df.columns = ['(p, d, q)', 'AIC']\n", - " #Sort in ascending order, lower AIC is better\n", - " result_df = result_df.sort_values(by='AIC', ascending=True).reset_index(drop=True)\n", - "\n", - " return result_df" - ], - "metadata": { - "id": "bQ1NBJKCe9ws" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "ps = range(0, 8, 1)\n", - "d = 1\n", - "qs = range(0, 8, 1)\n", - "# Create a list with all possible combination of parameters\n", - "parameters = product(ps, qs)\n", - "parameters_list = list(parameters)\n", - "order_list = []\n", - "for each in parameters_list:\n", - " each = list(each)\n", - " each.insert(1, 1)\n", - " each = tuple(each)\n", - " order_list.append(each)\n", - "\n", - "result_df = optimize_ARIMA(order_list, exog= df['Close_diff'])\n", - "result_df" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000, - "referenced_widgets": [ - "2d12c002ccb64f0faf09a1dce1e103b0", - "3713652868294ce1876c9d7a04e3ce42", - "bc105f4b911e4eeeb4e0c5d3246a068a", - "a90550a9ce124f089e5baa9c850fef6f", - "ad002fdb6b5e41eeaa7e2cd02ccba541", - "1bbd3415268d44edb3ba7fdce43b816d", - "6199e690822a44d19623d7febd3c5e58", - "aface8e24692405baaa4e13c356d199f", - "ee66fb56cee04a1487557dfc74f1ccad", - "901b687c700c4bd9bbcc07f793eb72ca", - "bd7e7b51165946a5834fd688fdc6686c" - ] - }, - "id": "2N8bwlqmgh0l", - "outputId": "7f7b6549-866c-4f6c-8a21-a912cda578fc" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/plain": [ - " 0%| | 0/64 [00:00\n", - "
\n", - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
(p, d, q)AIC
0(1, 1, 1)1173.487398
1(2, 1, 1)1173.782953
2(1, 1, 2)1174.070595
3(0, 1, 3)1174.126408
4(0, 1, 2)1174.267185
5(0, 1, 1)1174.458121
6(2, 1, 2)1175.781483
7(3, 1, 1)1175.781844
8(0, 1, 4)1175.993243
9(3, 1, 2)1177.455039
10(4, 1, 1)1177.757546
11(0, 1, 5)1177.791493
12(3, 1, 3)1178.185211
13(4, 1, 2)1179.382167
14(0, 1, 6)1179.727750
15(5, 1, 1)1179.743136
16(5, 1, 2)1181.345726
17(6, 1, 1)1181.725322
18(0, 1, 7)1181.725832
19(5, 1, 3)1183.357599
20(7, 1, 1)1183.613455
21(7, 1, 2)1184.133557
22(3, 1, 0)1184.463428
23(4, 1, 0)1184.717132
24(7, 1, 3)1184.986355
25(5, 1, 0)1185.454265
26(2, 1, 0)1186.435658
27(6, 1, 0)1186.771205
28(7, 1, 0)1187.679874
29(1, 1, 0)1190.687542
30(0, 1, 0)1227.389901
\n", - "
\n", - " \n", - " \n", - " \n", - "\n", - " \n", - "
\n", - " \n", - " " - ] - }, - "metadata": {}, - "execution_count": 192 - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Model Training" - ], - "metadata": { - "id": "GA3lBkdClDrY" - } - }, - { - "cell_type": "code", - "source": [ - "def find_anomalies(squared_errors):\n", - " threshold = np.mean(squared_errors) + np.std(squared_errors)\n", - " predictions = (squared_errors >= threshold).astype(int)\n", - " return predictions, threshold" - ], - "metadata": { - "id": "jU31jW8WgwEn" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "model = ARIMA(differenced_df['Close_diff'], order=(1, 1, 1))\n", - "arma_fit = model.fit()\n", - "squared_errors = arma_fit.resid ** 2\n", - "predictions, threshold = find_anomalies(squared_errors)" - ], - "metadata": { - "id": "MvOcQa4BiZXV" - }, - "execution_count": null, - "outputs": [] - }, - { - "cell_type": "code", - "source": [ - "#0 = normal observations ; 1 = anomalous observations\n", - "df['Anomaly'] = predictions\n", - "print(df)" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "s_cDYDpgimqd", - "outputId": "01a56090-6dd7-4b1d-9662-f15d97c93506" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "stream", - "name": "stdout", - "text": [ - " Close Close_diff Anomaly\n", - "Date \n", - "2017-01-01 23360.779297 -90.263959 NaN\n", - "2017-02-01 23740.730469 379.951172 0.0\n", - "2017-03-01 24111.589844 370.859375 0.0\n", - "2017-04-01 24615.130859 503.541015 0.0\n", - "2017-05-01 25660.650391 1045.519532 0.0\n", - "... ... ... ...\n", - "2022-05-01 21415.199219 325.808594 0.0\n", - "2022-06-01 21859.789063 444.589844 0.0\n", - "2022-07-01 20156.509766 -1703.279297 0.0\n", - "2022-08-01 19954.390625 -202.119141 0.0\n", - "2022-09-01 17222.830078 -2731.560547 1.0\n", - "\n", - "[69 rows x 3 columns]\n" - ] - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "## Result Illustration" - ], - "metadata": { - "id": "SjGYmVfclHfi" - } - }, - { - "cell_type": "code", - "source": [ - "#Dont run first\n", - "import plotly.graph_objects as go\n", - "# create list of outlier_dates\n", - "\n", - "\n", - "# plot value on y-axis and date on x-axis\n", - "fig = px.line(df, df.index, y=\"Close\", title='Monthly Prices of Hang Seng Index - UNSUPERVISED ANOMALY DETECTION (ARIMA Model)', template = 'plotly_dark')\n", - "fig.update_traces(line_color='#22577A')\n", - "\n", - "outlier_dates = df[df['Anomaly'] == 1].index\n", - "\n", - "\n", - "# obtain y value of anomalies to plot\n", - "y_values = [df.loc[i]['Close'] for i in outlier_dates]\n", - "fig.add_trace(go.Scatter(x=outlier_dates, y=y_values, mode = 'markers',\n", - " text=outlier_dates.astype(str).to_list(),\n", - " name = 'Anomaly',\n", - " marker=dict(color='red',size=10)))\n", - "\n", - "for i in range(len(outlier_dates)):\n", - " fig.add_annotation(x = outlier_dates[i], y = y_values[i] ,\n", - " text= outlier_dates.astype(str).to_list()[i][:7],\n", - " showarrow=False,\n", - " yshift=20)\n", - "\n", - "fig.show()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 542 - }, - "id": "c0ifCBxTjGd4", - "outputId": "a559f7de-adc7-4f66-a61b-0fb04548f88e" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "display_data", - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "
\n", - "
\n", - "\n", - "" - ] - }, - "metadata": {} - } - ] - }, - { - "cell_type": "markdown", - "source": [ - "![newplot (2).png]()" - ], - "metadata": { - "id": "_E07k8m1Nish" - } - }, - { - "cell_type": "code", - "source": [ - "outlier_dates.astype(str).to_list()" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "4U5ZJ4yrkYKa", - "outputId": "8c272df7-9ead-4581-d1e1-62566fba6530" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "['2018-01-01',\n", - " '2018-02-01',\n", - " '2018-06-01',\n", - " '2018-10-01',\n", - " '2019-01-01',\n", - " '2019-05-01',\n", - " '2019-08-01',\n", - " '2019-12-01',\n", - " '2020-03-01',\n", - " '2020-11-01',\n", - " '2021-07-01',\n", - " '2022-09-01']" - ] - }, - "metadata": {}, - "execution_count": 197 - } - ] - }, - { - "cell_type": "code", - "source": [ - "y_values" - ], - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "FTDq9RA6m3i5", - "outputId": "41351507-292d-4b70-ba8e-3a84bc3787e3" - }, - "execution_count": null, - "outputs": [ - { - "output_type": "execute_result", - "data": { - "text/plain": [ - "[32887.269531,\n", - " 24979.689453,\n", - " 28633.179688,\n", - " 28542.619141,\n", - " 26906.720703,\n", - " 26312.630859,\n", - " 23603.480469,\n", - " 25177.050781,\n", - " 26341.490234,\n", - " 28283.710938,\n", - " 25961.029297,\n", - " 23475.259766]" - ] - }, - "metadata": {}, - "execution_count": 156 - } - ] - }, - { - "cell_type": "code", - "source": [], - "metadata": { - "id": "CY-XwxCnq3RB" - }, - "execution_count": null, - "outputs": [] - } - ] -} \ No newline at end of file diff --git a/Codes/Task 4/Part_C_Stock_Anomaly_Associated_Events_Analysis_.ipynb b/Codes/Task 4/Part_C_Stock_Anomaly_Associated_Events_Analysis_.ipynb deleted file mode 100644 index 8a5dee3..0000000 --- a/Codes/Task 4/Part_C_Stock_Anomaly_Associated_Events_Analysis_.ipynb +++ /dev/null @@ -1,2591 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": { - "id": "view-in-github", - "colab_type": "text" - }, - "source": [ - "\"Open" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c33f96b", - "metadata": { - "id": "5c33f96b", - "outputId": "560c037e-253a-4a43-e44d-57a3051f96d6" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: plotly in c:\\users\\jane wong\\anaconda3\\lib\\site-packages (5.11.0)\n", - "Requirement already satisfied: tenacity>=6.2.0 in c:\\users\\jane wong\\anaconda3\\lib\\site-packages (from plotly) (8.0.1)\n", - "Note: you may need to restart the kernel to use updated packages.\n" - ] - } - ], - "source": [ - "pip install plotly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a23ca7e", - "metadata": { - "id": "1a23ca7e", - "outputId": "f9f72459-d9d5-4187-acfc-0ac20ce8b75f" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Building prefix dict from C:\\Users\\Jane Wong\\OneDrive - HKUST Connect\\ESG\\dict.txt ...\n", - "Loading model from cache C:\\Users\\JANEWO~1\\AppData\\Local\\Temp\\jieba.u4dcdda7161e8cb4bbea6dffafa6acf12.cache\n", - "Loading model cost 1.510 seconds.\n", - "Prefix dict has been built successfully.\n" - ] - } - ], - "source": [ - "import statsmodels\n", - "import requests\n", - "from bs4 import BeautifulSoup\n", - "from selenium.webdriver import Chrome\n", - "from selenium.webdriver.common.keys import Keys\n", - "from selenium import webdriver\n", - "\n", - "import jieba\n", - "import jieba.analyse\n", - "jieba.initialize()\n", - "jieba.set_dictionary('dict.txt')\n", - "\n", - "import yfinance as yf\n", - "\n", - "import numpy as np\n", - "import matplotlib.pyplot as plt\n", - "import pandas as pd\n", - "import random\n", - "\n", - "import numpy as np\n", - "import tensorflow as tf\n", - "import pandas as pd\n", - "pd.options.mode.chained_assignment = None\n", - "import seaborn as sns\n", - "from matplotlib.pylab import rcParams\n", - "import matplotlib.pyplot as plt\n", - "import plotly\n", - "import plotly.express as px\n", - "import plotly.graph_objects as go\n", - "import datetime as dt\n", - "\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.feature_extraction.text import TfidfVectorizer\n", - "from sklearn.feature_extraction.text import CountVectorizer\n", - "import pyLDAvis\n", - "from __future__ import print_function\n", - "import pyLDAvis\n", - "import pyLDAvis.sklearn\n", - "pyLDAvis.enable_notebook()\n", - "from sklearn.datasets import fetch_20newsgroups\n", - "from sklearn.decomposition import LatentDirichletAllocation\n", - "from sklearn.preprocessing import StandardScaler\n", - "from sklearn.decomposition import PCA\n", - "from sklearn.covariance import EllipticEnvelope\n", - "from sklearn.ensemble import IsolationForest\n", - "\n", - "%matplotlib inline\n", - "\n", - "sns.set(style='whitegrid', palette='muted')\n", - "rcParams['figure.figsize'] = 14, 8\n", - "np.random.seed(0)\n", - "tf.random.set_seed(0)\n", - "\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab11c0c8", - "metadata": { - "id": "ab11c0c8", - "outputId": "f23983a2-c0bf-4889-af27-6151c35ae229" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "C:\\Users\\Jane Wong\\anaconda3\\lib\\site-packages\\matplotlib\\mpl-data\\matplotlibrc\n" - ] - } - ], - "source": [ - "import matplotlib\n", - "print(matplotlib.matplotlib_fname())" - ] - }, - { - "cell_type": "markdown", - "id": "e5f09f18", - "metadata": { - "id": "e5f09f18" - }, - "source": [ - "# Import Preprocessed Headlines" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef898947", - "metadata": { - "id": "ef898947", - "outputId": "3aac0a0c-75f4-4c63-cd91-65005c68f520" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
dateheadlinesegmented headlinekeywords
02017-01-01房委會綠建環評社區獲鉑金評級房委會 綠建 環評 社區 獲 鉑金 評級房委會 綠建 環評 社區 鉑金 評級
12017-01-01發祥街西項目獲鉑金評級發祥 街西 項目 獲 鉑金 評級發祥 街西 項目 鉑金 評級
22017-01-01潘基文告別寄語聯國為無聲者吶喊潘基文 告別 寄語 聯國為 無聲 者 吶喊告別 寄語 聯國為 無聲 吶喊 潘基文
32017-01-01潘基文告別聯合國潘基文 告別 聯合國告別 聯合國 潘基文
42017-01-01兩岸國際潘基文告別寄語聯國為無聲者吶喊兩岸 國際 潘基文 告別 寄語 聯國為 無聲 者 吶喊兩岸 國際 告別 寄語 聯國為 無聲 吶喊 潘基文
...............
3219412022-09-30俄羅斯指西方涉蓄意破壞北溪天然氣管道美國稱推測元兇為時尚早俄羅斯 指 西方 涉 蓄意 破壞 北溪 天然氣 管道 美國 稱 推測 元兇 為 時尚 早俄羅斯 破壞 天然氣 美國 推測 元兇 時尚 北溪 蓄意 管道 西方
3219422022-09-30美國辦太平洋島國峰會被指抗衡中國影響力中方重申不搞經濟脅迫美 國辦 太平洋 島國 峰會 被 指 抗衡 中國 影響力 中方 重申 不 搞 經濟 脅迫國辦 島國 峰會 中國 影響力 經濟 脅迫 抗衡 重申 中方 太平洋
3219432022-09-30澳門旅議會料國慶訪澳旅客增至三萬恢復內地團後旅客人數將再增澳門 旅 議會 料 國慶 訪澳 旅客 增至 三萬 恢復 內地 團後 旅客 人數 將再 增旅客 澳門 議會 國慶 訪澳 三萬 恢復 內地 團後 人數 將再 增至
3219442022-09-30謝安琪與母女檔拍親子裝大讚囡囡表現專業謝安琪 與 母女 檔 拍 親子裝 大 讚 囡 囡 表現 專業謝安琪 親子裝 表現 專業 母女
3219452022-09-30有狂風雷暴日間短暫時間有陽光最高度有 狂風 雷暴 日間 短暫 時間 有 陽光 最 高度狂風 日間 短暫 時間 陽光 雷暴 高度
\n", - "

321946 rows × 4 columns

\n", - "
" - ], - "text/plain": [ - " date headline \\\n", - "0 2017-01-01 房委會綠建環評社區獲鉑金評級 \n", - "1 2017-01-01 發祥街西項目獲鉑金評級 \n", - "2 2017-01-01 潘基文告別寄語聯國為無聲者吶喊 \n", - "3 2017-01-01 潘基文告別聯合國 \n", - "4 2017-01-01 兩岸國際潘基文告別寄語聯國為無聲者吶喊 \n", - "... ... ... \n", - "321941 2022-09-30 俄羅斯指西方涉蓄意破壞北溪天然氣管道美國稱推測元兇為時尚早 \n", - "321942 2022-09-30 美國辦太平洋島國峰會被指抗衡中國影響力中方重申不搞經濟脅迫 \n", - "321943 2022-09-30 澳門旅議會料國慶訪澳旅客增至三萬恢復內地團後旅客人數將再增 \n", - "321944 2022-09-30 謝安琪與母女檔拍親子裝大讚囡囡表現專業 \n", - "321945 2022-09-30 有狂風雷暴日間短暫時間有陽光最高度 \n", - "\n", - " segmented headline \\\n", - "0 房委會 綠建 環評 社區 獲 鉑金 評級 \n", - "1 發祥 街西 項目 獲 鉑金 評級 \n", - "2 潘基文 告別 寄語 聯國為 無聲 者 吶喊 \n", - "3 潘基文 告別 聯合國 \n", - "4 兩岸 國際 潘基文 告別 寄語 聯國為 無聲 者 吶喊 \n", - "... ... \n", - "321941 俄羅斯 指 西方 涉 蓄意 破壞 北溪 天然氣 管道 美國 稱 推測 元兇 為 時尚 早 \n", - "321942 美 國辦 太平洋 島國 峰會 被 指 抗衡 中國 影響力 中方 重申 不 搞 經濟 脅迫 \n", - "321943 澳門 旅 議會 料 國慶 訪澳 旅客 增至 三萬 恢復 內地 團後 旅客 人數 將再 增 \n", - "321944 謝安琪 與 母女 檔 拍 親子裝 大 讚 囡 囡 表現 專業 \n", - "321945 有 狂風 雷暴 日間 短暫 時間 有 陽光 最 高度 \n", - "\n", - " keywords \n", - "0 房委會 綠建 環評 社區 鉑金 評級 \n", - "1 發祥 街西 項目 鉑金 評級 \n", - "2 告別 寄語 聯國為 無聲 吶喊 潘基文 \n", - "3 告別 聯合國 潘基文 \n", - "4 兩岸 國際 告別 寄語 聯國為 無聲 吶喊 潘基文 \n", - "... ... \n", - "321941 俄羅斯 破壞 天然氣 美國 推測 元兇 時尚 北溪 蓄意 管道 西方 \n", - "321942 國辦 島國 峰會 中國 影響力 經濟 脅迫 抗衡 重申 中方 太平洋 \n", - "321943 旅客 澳門 議會 國慶 訪澳 三萬 恢復 內地 團後 人數 將再 增至 \n", - "321944 謝安琪 親子裝 表現 專業 母女 \n", - "321945 狂風 日間 短暫 時間 陽光 雷暴 高度 \n", - "\n", - "[321946 rows x 4 columns]" - ] - }, - "execution_count": 50, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Import Pre-processed headliens\n", - "headline_df = pd.read_csv('headline_df.csv')\n", - "headline_df = headline_df.drop(columns = ['Unnamed: 0'])\n", - "headline_df['date'] = pd.to_datetime(headline_df['date'])\n", - "headline_df" - ] - }, - { - "cell_type": "markdown", - "id": "ae684a95", - "metadata": { - "id": "ae684a95" - }, - "source": [ - "# Data Partition" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8dcbeb45", - "metadata": { - "id": "8dcbeb45" - }, - "outputs": [], - "source": [ - "sample_df = headline_df.groupby(\"date\").sample(frac = 0.01 , random_state=1).reset_index()\n", - "\n", - "# Anomaly1: 2018-01(up)\n", - "anomaly1 = headline_df[(headline_df['date']>='2017-12-01') & (headline_df['date']<='2017-12-31')]\n", - "\n", - "# Anomaly2: 2018-02(drop)\n", - "anomaly2 = headline_df[(headline_df['date']>='2018-1-01') & (headline_df['date']<='2018-1-31')]\n", - "\n", - "# Anomaly3: 2018-06(drop)\n", - "anomaly3 = headline_df[(headline_df['date']>='2018-5-01') & (headline_df['date']<='2018-5-31')]\n", - "\n", - "#Anomaly4: 2018-10(drop)\n", - "anomaly4 = headline_df[(headline_df['date']>='2018-9-01') & (headline_df['date']<='2018-9-30')]\n", - "\n", - "#Anomaly5: 2019-01(up)\n", - "anomaly5 = headline_df[(headline_df['date']>='2018-12-01') & (headline_df['date']<='2018-12-31')]\n", - "\n", - "#Anomaly6: 2019-05(drop)\n", - "anomaly6 = headline_df[(headline_df['date']>='2019-04-01') & (headline_df['date']<='2019-04-30')]\n", - "\n", - "#Anomaly7: 2019-08(drop)\n", - "anomaly7 = headline_df[(headline_df['date']>='2019-07-01') & (headline_df['date']<='2019-07-31')]\n", - "\n", - "#Anomaly8: 2019-12(up)\n", - "anomaly8 = headline_df[(headline_df['date']>='2019-11-01') & (headline_df['date']<='2019-11-30')]\n", - "\n", - "#Anomaly9: 2020-03(drop)\n", - "anomaly9 = headline_df[(headline_df['date']>='2020-02-01') & (headline_df['date']<='2020-02-29')]\n", - "\n", - "#Anomaly10: 2020-11(up)\n", - "anomaly10 = headline_df[(headline_df['date']>='2020-10-01') & (headline_df['date']<='2020-10-31')]\n", - "\n", - "#Anomaly11: 2021-07(drop)\n", - "anomaly11 = headline_df[(headline_df['date']>='2021-06-01') & (headline_df['date']<='2021-06-30')]\n", - "\n", - "#Anomaly12: 2022-09(drop)\n", - "anomaly12 = headline_df[(headline_df['date']>='2020-08-01') & (headline_df['date']<='2020-08-31')]\n", - "\n", - "positive_anomaly = pd.concat([anomaly1, anomaly5, anomaly8, anomaly10])\n", - "negative_anomaly = pd.concat([anomaly2 , anomaly3 ,anomaly4 , anomaly6 , anomaly7 , anomaly9 , anomaly11 , anomaly12])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "40e5343e", - "metadata": { - "id": "40e5343e", - "outputId": "0168385a-16bc-40c8-e054-89acb9735e5c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
indexdateheadlinesegmented headlinekeywords
0222017-01-01中國今發布南沙海洋環境預報中國 今發布 南沙 海洋環境 預報中國 今發布 海洋環境 預報 南沙
11132017-01-03辦理低保豈能成吃拿卡要的藉口辦理 低保 豈能 成 吃拿卡要 的 藉口吃拿卡要 辦理 豈能 藉口 低保
22462017-01-04國立傳統藝術中心颱風影響工程對外開放時程延後國立 傳統 藝術 中心 颱風 影響 工程 對外開放 時程 延後國立 傳統 藝術 颱風 影響 對外開放 時程 延後 工程 中心
33352017-01-05六成高空氣污染日數受內地影響環保署不排除霧霾襲港六成 高 空氣污染 日數 受 內地 影響 環保署 不 排除 霧 霾 襲港空氣污染 日數 內地 影響 環保署 襲港 六成 排除
43602017-01-06中國最安全城市排行榜香港第一拉薩第二中國 最 安全 城市 排行榜 香港 第一 拉薩 第二中國 拉薩 排行榜 香港 安全 第二 第一 城市
..................
32113212982022-09-28李家超指當局銳意發展創科注入新經濟動力李家 超指 當局 銳意 發展 創科 注入 新 經濟 動力超指 當局 銳意 發展 創科 經濟 動力 李家 注入
32123215432022-09-29級颶風撲佛州萬人疏散級 颶風 撲 佛州 萬人 疏散颶風 佛州 萬人 疏散
32133215522022-09-29綠色和平極端酷熱工友極危險拒絕紙老虎保障綠色 和平 極端 酷熱 工友 極 危險 拒絕 紙老虎 保障綠色 極端 酷熱 危險 拒絕 紙老虎 工友 和平 保障
32143218402022-09-30美國公布太平洋夥伴關係宣言所羅門群島簽署美國 公布 太平洋 夥伴關係 宣言 所羅門 群島 簽署美國 夥伴關係 所羅門 群島 簽署 宣言 太平洋 公布
32153218502022-09-30英皇集團加盟平台網上售戲飛捐支持植樹英皇 集團 加盟 平台 網上 售戲 飛捐 支持 植樹集團 網上 售戲 飛捐 植樹 英皇 加盟 平台 支持
\n", - "

3216 rows × 5 columns

\n", - "
" - ], - "text/plain": [ - " index date headline \\\n", - "0 22 2017-01-01 中國今發布南沙海洋環境預報 \n", - "1 113 2017-01-03 辦理低保豈能成吃拿卡要的藉口 \n", - "2 246 2017-01-04 國立傳統藝術中心颱風影響工程對外開放時程延後 \n", - "3 335 2017-01-05 六成高空氣污染日數受內地影響環保署不排除霧霾襲港 \n", - "4 360 2017-01-06 中國最安全城市排行榜香港第一拉薩第二 \n", - "... ... ... ... \n", - "3211 321298 2022-09-28 李家超指當局銳意發展創科注入新經濟動力 \n", - "3212 321543 2022-09-29 級颶風撲佛州萬人疏散 \n", - "3213 321552 2022-09-29 綠色和平極端酷熱工友極危險拒絕紙老虎保障 \n", - "3214 321840 2022-09-30 美國公布太平洋夥伴關係宣言所羅門群島簽署 \n", - "3215 321850 2022-09-30 英皇集團加盟平台網上售戲飛捐支持植樹 \n", - "\n", - " segmented headline keywords \n", - "0 中國 今發布 南沙 海洋環境 預報 中國 今發布 海洋環境 預報 南沙 \n", - "1 辦理 低保 豈能 成 吃拿卡要 的 藉口 吃拿卡要 辦理 豈能 藉口 低保 \n", - "2 國立 傳統 藝術 中心 颱風 影響 工程 對外開放 時程 延後 國立 傳統 藝術 颱風 影響 對外開放 時程 延後 工程 中心 \n", - "3 六成 高 空氣污染 日數 受 內地 影響 環保署 不 排除 霧 霾 襲港 空氣污染 日數 內地 影響 環保署 襲港 六成 排除 \n", - "4 中國 最 安全 城市 排行榜 香港 第一 拉薩 第二 中國 拉薩 排行榜 香港 安全 第二 第一 城市 \n", - "... ... ... \n", - "3211 李家 超指 當局 銳意 發展 創科 注入 新 經濟 動力 超指 當局 銳意 發展 創科 經濟 動力 李家 注入 \n", - "3212 級 颶風 撲 佛州 萬人 疏散 颶風 佛州 萬人 疏散 \n", - "3213 綠色 和平 極端 酷熱 工友 極 危險 拒絕 紙老虎 保障 綠色 極端 酷熱 危險 拒絕 紙老虎 工友 和平 保障 \n", - "3214 美國 公布 太平洋 夥伴關係 宣言 所羅門 群島 簽署 美國 夥伴關係 所羅門 群島 簽署 宣言 太平洋 公布 \n", - "3215 英皇 集團 加盟 平台 網上 售戲 飛捐 支持 植樹 集團 網上 售戲 飛捐 植樹 英皇 加盟 平台 支持 \n", - "\n", - "[3216 rows x 5 columns]" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sample_df" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47bcedf7", - "metadata": { - "id": "47bcedf7" - }, - "outputs": [], - "source": [ - "def vectorize(df):\n", - "\n", - " texts = df['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - " vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', use_idf=False)\n", - "\n", - "# apply transformation\n", - " tf = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - " tf_feature_names = vectorizer.get_feature_names()\n", - "\n", - " return tf, tf_feature_names\n", - "\n", - "sample_df = headline_df.groupby(\"date\").sample(frac = 0.01 , random_state=1).reset_index()\n", - "tf_sample, tf_feature_names_sample = vectorize(sample_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fb3e1e35", - "metadata": { - "id": "fb3e1e35", - "outputId": "f1e4163a-853c-4e80-b232-6e4505284b04" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Fitting 5 folds for each of 4 candidates, totalling 20 fits\n", - "Best Model's Params: {'learning_decay': 0.7, 'n_components': 2}\n", - "Best Log Likelihood Score: -10668.241981968877\n" - ] - } - ], - "source": [ - "from sklearn.model_selection import GridSearchCV\n", - "\n", - "# Options to try with our LDA\n", - "search_params = {\n", - " 'n_components': [2, 3, 4, 8],\n", - " 'learning_decay': [0.7]\n", - "}\n", - "\n", - "# Set up LDA\n", - "model = LatentDirichletAllocation(learning_method='online')\n", - "\n", - "# Try all of the options\n", - "gridsearch = GridSearchCV(model, param_grid=search_params, n_jobs=-1, verbose=1)\n", - "gridsearch.fit(tf_sample)\n", - "\n", - "print(\"Best Model's Params: \", gridsearch.best_params_)\n", - "print(\"Best Log Likelihood Score: \", gridsearch.best_score_)" - ] - }, - { - "cell_type": "markdown", - "id": "2f586351", - "metadata": { - "id": "2f586351" - }, - "source": [ - "# Anomaly Analysis" - ] - }, - { - "cell_type": "markdown", - "id": "2cf58c4e", - "metadata": { - "id": "2cf58c4e" - }, - "source": [ - "## Postive Anomaly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "234e7c18", - "metadata": { - "id": "234e7c18", - "outputId": "b0620adf-9556-42e4-c057-56ab2990cc13" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 82, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = positive_anomaly['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_pos = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_namespos = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_pos)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_pos, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ecc0552b", - "metadata": { - "id": "ecc0552b", - "outputId": "fa8ca8c2-9f16-4263-ef46-956b70ffbb7d" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['發展', '中國', '香港', '合作', '經濟', '持續', '習近平', '國際', '企業', '印尼', '全球', '海嘯', '創新', '綠色', '環保', '增至', '世界', '論壇', '投資', '深圳', '金融', '科技', '國家', '項目', '舉行', '推動', '市場', '美國', '社會', '地震']\n", - "[101.2411081 96.0707096 59.42785118 57.54349857 51.37339908\n", - " 50.76843273 46.13342008 44.28069624 43.60323667 40.75336758\n", - " 40.19445033 39.68266857 39.33392443 39.19353101 38.84124136\n", - " 38.45761399 36.06319632 35.28402145 34.40511358 33.58012288\n", - " 33.46073164 33.06982484 32.07327046 31.32372394 29.07798231\n", - " 29.00120612 28.38383962 27.2009543 26.44828704 26.34378019]\n", - "['天文台', '颱風', '風球', '天氣', '香港', '信號', '沙德爾', '三號', '浪卡', '熱帶', '年度', '日本', '風暴', '氣旋', '澳門', '強風', '改發', '考慮', '失蹤', '明日', '新聞', '天晴', '生效', '八號', '本港', '維持', '乾燥', '威尼斯', '暴雨', '氣溫']\n", - "[150.47104956 105.88991908 84.68617347 71.90992239 71.86424153\n", - " 64.58738544 55.39163987 54.99363986 54.42447449 49.14438078\n", - " 48.62528323 46.6395833 46.1875637 45.96217368 45.84951221\n", - " 44.01028648 43.82392868 43.71010794 43.09741272 42.24091468\n", - " 41.72430846 38.65664204 38.60991495 38.58352297 38.22411116\n", - " 36.90388171 36.8190931 36.76832535 36.6946579 35.5732837 ]\n" - ] - } - ], - "source": [ - "def plot_top_words(model, feature_names, n_top_words):\n", - " for topic_idx, topic in enumerate(model.components_):\n", - " #ID\n", - " top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n", - " #Top Features\n", - " top_features = [feature_names[i] for i in top_features_ind]\n", - " print( top_features )\n", - " #Term Frequency\n", - " weights = topic[top_features_ind]\n", - " print( weights )\n", - "plot_top_words(lda_tfidf,tf_feature_namespos,30)" - ] - }, - { - "cell_type": "markdown", - "id": "0c5057bb", - "metadata": { - "id": "0c5057bb" - }, - "source": [ - "## Negative Anomaly" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e74bb35", - "metadata": { - "id": "3e74bb35", - "outputId": "3fe3a77c-94d5-4a49-8c6c-f2d2fb22d284" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 84, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = negative_anomaly['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_neg = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_namesneg = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_neg)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_neg, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eeb059ad", - "metadata": { - "id": "eeb059ad", - "outputId": "f6bbdb19-00fe-4606-a47c-513154505470" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "['山竹', '颱風', '日本', '中國', '香港', '機場', '航班', '取消', '關西', '政府', '影響', '風災', '襲港', '服務', '市民', '暴雨', '居民', '北海道', '吹襲', '強颱風', '經濟', '停課', '美國', '中心', '地震', '交通', '大阪', '恢復', '傾瀉', '風暴']\n", - "[506.38146773 391.57701053 300.57730284 227.96251346 224.9981018\n", - " 221.75379882 209.6725619 205.21691106 184.88193729 177.37250973\n", - " 158.0544783 155.8690724 147.58390469 147.27878976 137.56808612\n", - " 128.45550673 125.56659391 121.34726591 120.54969733 118.26279574\n", - " 118.25879508 117.38318561 115.80386446 115.72894442 115.37733298\n", - " 114.62226182 114.05090662 111.74960918 109.65405764 107.00338141]\n", - "['天文台', '天氣', '山竹', '颱風', '驟雨', '警告', '雷暴', '酷熱', '暴雨', '風球', '信號', '香港', '最高', '持續', '生效', '發展', '本港', '今年', '氣溫', '未來', '熱帶', '死亡', '澳門', '下午', '今日', '明日', '狂風', '改發', '多雲', '襲港']\n", - "[644.88958065 506.62575646 385.47540092 314.86206047 314.11464705\n", - " 313.21622594 307.8966413 305.52522511 287.7213258 285.60528845\n", - " 264.38707162 256.8883382 241.94238129 230.64453552 214.91650521\n", - " 204.68042101 187.79002707 184.47034908 182.62945358 176.20520685\n", - " 160.57601183 159.07470141 155.52578288 149.4592725 148.39980017\n", - " 147.34587385 145.60530085 144.10603682 141.75708252 139.30459805]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_namesneg,30)" - ] - }, - { - "cell_type": "markdown", - "id": "8a779c47", - "metadata": { - "id": "8a779c47" - }, - "source": [ - "## Anomaly1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "61b1aaf8", - "metadata": { - "id": "61b1aaf8" - }, - "outputs": [], - "source": [ - "texts = anomaly1['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_1 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names1 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1c0fa1d7", - "metadata": { - "id": "1c0fa1d7", - "outputId": "1c952b33-221f-4b82-e7d8-a64f31b137d9" - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Building prefix dict from C:\\Users\\Jane Wong\\OneDrive - HKUST Connect\\ESG\\dict.txt ...\n", - "Loading model from cache C:\\Users\\JANEWO~1\\AppData\\Local\\Temp\\jieba.u4dcdda7161e8cb4bbea6dffafa6acf12.cache\n", - "Loading model cost 1.502 seconds.\n", - "Prefix dict has been built successfully.\n" - ] - }, - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_1, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fcca99fe", - "metadata": { - "scrolled": true, - "id": "fcca99fe", - "outputId": "177bb6c2-b666-49ed-a2fb-1fe26e9ae632" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 244 75 6015 33 2278 4808 5467 2040 8104 1282 5026 6230 926 4020\n", - " 508 6152 2809 3103 2474 2469 5346 1918 2084 2790 809 5428 893 4174\n", - " 3776 1933]\n", - "['中國', '一路', '經濟', '一帶', '天氣', '海洋公園', '發展', '報告', '香港', '加價', '澳門', '聯合國', '全球', '明年', '今年', '美國', '工作', '影響', '安排', '安全', '生態', '國家', '增長', '崔世安', '億元', '疏散', '內地', '最高', '政府', '國際']\n", - "[14.93233549 14.65068642 14.38716795 14.27596208 12.21832112 11.08610891\n", - " 10.7183718 10.44307966 9.75482772 9.69973329 9.0128817 7.8936319\n", - " 7.58349759 7.37509395 7.24714627 6.95038934 6.92315806 6.50409473\n", - " 6.46599433 6.13037681 5.9135075 5.8652221 5.76699568 5.71426195\n", - " 5.69323246 5.60521885 5.59778677 5.58425532 5.53228233 5.49487096]\n", - "[6461 244 8104 5467 2321 2287 4528 1663 5026 8022 6039 7946 2004 8032\n", - " 4087 3497 2933 221 1733 1476 2080 4020 1933 6215 5126 926 1241 6330\n", - " 562 6532]\n", - "['菲律賓', '中國', '香港', '發展', '失蹤', '天秤', '死亡', '合作', '澳門', '風災', '綠色', '項目', '城市', '颱風', '智慧', '持續', '年度', '世界', '吹襲', '南部', '增至', '明年', '國際', '聖誕', '熱帶風暴', '全球', '創新', '至少', '企業', '藍圖']\n", - "[29.01099883 24.19881731 22.80510549 20.91110496 18.71793268 18.39089697\n", - " 18.32998645 17.80649145 17.03970118 16.25548684 15.15205325 15.10880485\n", - " 15.04809374 15.01868758 14.89535574 13.44233067 13.40508341 13.30042488\n", - " 13.17984564 12.48771085 12.35592089 12.07720772 11.95639465 11.88260897\n", - " 11.3967091 11.34163453 10.65901807 10.6421779 10.44609119 10.27861333]\n" - ] - } - ], - "source": [ - "def plot_top_words(model, feature_names, n_top_words):\n", - " for topic_idx, topic in enumerate(model.components_):\n", - " #ID\n", - " top_features_ind = topic.argsort()[: -n_top_words - 1 : -1]\n", - " #Top Features\n", - " top_features = [feature_names[i] for i in top_features_ind]\n", - " print( top_features )\n", - " #Term Frequency\n", - " weights = topic[top_features_ind]\n", - " print( weights )\n", - "\n", - "plot_top_words(lda_tfidf,tf_feature_names1,30)\n" - ] - }, - { - "cell_type": "markdown", - "id": "d454922e", - "metadata": { - "id": "d454922e" - }, - "source": [ - "## Anomaly2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c65c7915", - "metadata": { - "id": "c65c7915", - "outputId": "45f1b068-57f5-477d-b3dc-7f01384a1948" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 54, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly2['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_2 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names2 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_2)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_2, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "62e847d6", - "metadata": { - "id": "62e847d6", - "outputId": "f18c69f7-998d-46de-d5a4-2bcc35b1922c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 245 5615 794 3551 6188 2763 1454 3977 1356 8430 1951 6350 933 1400\n", - " 6213 2043 1246 3488 558 2124 3832 1292 8340 5816 5129 5643 68 4179\n", - " 1640 3042]\n", - "['中國', '發展', '傾瀉', '持續', '經濟', '山泥', '南加州', '新聞', '北極', '香港', '國際', '美國', '全球', '十大', '綠色', '城市', '創新', '投資', '企業', '增至', '政策', '加州', '風暴', '社會', '澳門', '白皮書', '一路', '暴雨', '合作', '建設']\n", - "[42.49585984 30.54598518 19.1661267 18.76194515 18.4235489 17.6693858\n", - " 17.49090193 15.83786441 13.80432049 13.77007552 13.24290763 12.85796231\n", - " 12.66589547 12.58630029 12.4619346 12.2420816 11.76736737 11.62151361\n", - " 11.61599877 11.56307189 11.45511977 11.30966067 10.93050839 10.88353691\n", - " 10.62521803 10.40936445 10.38885073 10.37054678 10.35008198 10.32093277]\n", - "[2313 1507 8430 2100 2843 4695 4686 6213 1640 2070 3124 4344 4252 7849\n", - " 1978 2311 4804 4240 4176 4714 4179 1268 4074 505 5931 6533 4675 216\n", - " 8412 4336]\n", - "['天氣', '去年', '香港', '塞納河', '巴黎', '水位', '氣溫', '綠色', '合作', '基金', '影響', '李克強', '會議', '金融', '團結', '天文台', '法國', '最高', '暴漲', '水浸', '暴雨', '劉怡翔', '昂坪', '今年', '空氣', '臭氧濃度', '氣候', '世界', '首次', '本港']\n", - "[23.52751378 18.43157945 18.2663389 16.43821411 15.38096033 12.43983824\n", - " 11.89169719 11.2883255 11.28324272 10.70551796 9.58600373 9.4118763\n", - " 8.91168984 8.67257947 8.57305055 8.47952376 8.42495104 8.4239795\n", - " 8.02321497 7.86370952 7.75669537 7.73984769 7.38107224 7.34727143\n", - " 7.24762137 7.16451806 7.14188279 7.02470619 6.97310905 6.45148573]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names2,30)" - ] - }, - { - "cell_type": "markdown", - "id": "087b7c2c", - "metadata": { - "id": "087b7c2c" - }, - "source": [ - "## Anomaly3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7ad35f6b", - "metadata": { - "id": "7ad35f6b", - "outputId": "78becb31-aceb-4aba-8cf7-c38e47b1cddf" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 56, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly3['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_3 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names3 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_3)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_3, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0d70bf6d", - "metadata": { - "id": "0d70bf6d", - "outputId": "e3a556d5-6cae-4c3d-e229-6d032da7dfd0" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 5506 7477 805 6066 2012 11156 6694 8658 8381 6028 7820 6294\n", - " 559 10420 5217 4081 4412 10828 5507 3112 10824 10952 292 11027\n", - " 3612 3041 2633 325 5316 6264]\n", - "['暴雨', '發展', '佛誕', '死亡', '印度', '香港', '溫度', '至少', '美國', '正午', '科技', '沙塵暴', '亞洲', '金融', '新界', '影響', '成立', '雷雨', '暴雨成災', '失蹤', '雷暴', '項目', '世界', '風暴', '局部地區', '天氣', '地盤', '中國', '日間', '汶川']\n", - "[24.28740601 21.14360177 18.88467386 18.35668084 18.22759079 14.5546624\n", - " 14.06443187 12.86192157 12.64554883 12.1850168 11.30309085 10.82907425\n", - " 10.6882978 10.15619752 10.14174138 10.03907714 9.8357761 9.52841732\n", - " 9.32172913 9.27595875 8.93463286 8.91379079 8.87344842 8.80850052\n", - " 8.79972721 8.78237002 8.753038 8.59103156 8.44763888 8.38482953]\n", - "[10329 3041 3032 9422 11270 5578 8100 4656 668 11156 11221 325\n", - " 7309 6149 10048 3035 363 10467 7720 5570 11251 210 5577 3771\n", - " 10824 5717 3540 7477 5346 215]\n", - "['酷熱', '天氣', '天文台', '警告', '高溫', '最高', '紀錄', '持續', '今年', '香港', '驟雨', '中國', '生效', '氣溫', '連續', '天晴', '中暑', '錄得', '破紀錄', '最熱', '高度', '下午', '最長', '市區', '雷暴', '本港', '小時', '發展', '明日', '下周']\n", - "[111.2382204 87.42003248 56.73019548 54.16668249 44.15062768\n", - " 42.97947653 42.66026818 41.05118086 39.9255193 34.42157182\n", - " 33.30573076 31.33539454 31.15201748 31.09338257 30.62620406\n", - " 30.54574545 30.34491483 28.83935476 23.56543575 22.54559855\n", - " 21.76383381 21.47646041 21.4628062 21.4435854 19.62212761\n", - " 19.40473137 18.68333907 17.91688142 17.73795088 17.61849345]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names3,30)" - ] - }, - { - "cell_type": "markdown", - "id": "943b2d14", - "metadata": { - "id": "943b2d14" - }, - "source": [ - "## Anomaly4" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4bf47677", - "metadata": { - "id": "4bf47677", - "outputId": "6b37f93f-f676-4849-88e3-42062cff8fd4" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 58, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly4['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_4 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names4 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_4)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_4, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60395f70", - "metadata": { - "id": "60395f70", - "outputId": "bf8f500d-a3f5-47af-ccaf-1c2c92d5c999" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[13045 14694 25809 21242 4864 26922 27189 7279 4286 782 9831 10299\n", - " 6419 6201 26962 25816 16482 15470 12959 6753 2470 6216 9100 26978\n", - " 5476 13193 9760 8848 18249 26874]\n", - "['日本', '機場', '關西', '航班', '取消', '颱風', '香港', '大阪', '北海道', '中國', '影響', '恢復', '地震', '國泰', '飛燕', '關閉', '滯留', '沖繩', '旅客', '增至', '停課', '國際', '巴士', '飛燕襲', '吹襲', '明日', '強颱風', '山竹', '發展', '風災']\n", - "[222.8873012 180.85941396 165.2196237 164.43213342 155.90591351\n", - " 130.96400274 125.75119972 103.35845911 90.8505733 75.8147143\n", - " 74.37972297 72.89083047 71.37575718 69.37214146 67.42851832\n", - " 65.39225781 57.44292264 54.82319061 53.42587854 53.11996002\n", - " 52.79890227 52.62454285 52.08152787 50.80162706 50.09151207\n", - " 49.97479682 46.81709525 45.8977747 44.05611777 42.92340789]\n", - "[ 8848 26922 7340 22200 9760 26883 16705 27189 2216 26929 7348 12384\n", - " 9146 5542 15298 18222 8725 26874 20662 12316 14003 24665 16142 5476\n", - " 2444 9831 26862 14087 21516 7191]\n", - "['山竹', '颱風', '天文台', '襲港', '強颱風', '風球', '澳門', '香港', '信號', '颶風', '天氣', '政府', '市民', '周日', '水浸', '登陸', '居民', '風災', '考慮', '改發', '本港', '過後', '清理', '吹襲', '停工', '影響', '風暴', '杏花', '菲律賓', '大澳']\n", - "[629.36590181 332.78570904 249.55923198 214.23465192 155.34990893\n", - " 142.6569857 129.09738752 128.30647763 125.90398591 109.30295786\n", - " 107.95409816 107.20837103 107.1757171 103.39142871 94.29401813\n", - " 88.95754895 87.038806 84.30804647 82.87663529 80.74462563\n", - " 80.41941527 77.74724117 75.17291346 74.29801265 73.58574976\n", - " 73.09725966 70.48596158 68.30434697 66.92146896 66.43370714]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names4,30)" - ] - }, - { - "cell_type": "markdown", - "id": "f91e7c07", - "metadata": { - "id": "f91e7c07" - }, - "source": [ - "## Anomaly5" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ac87e27b", - "metadata": { - "id": "ac87e27b", - "outputId": "b0a84f57-b49b-4c7a-abad-2f0350b497f3" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 60, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly5['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_5 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names5 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_5)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_5, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a81f2fd", - "metadata": { - "id": "8a81f2fd", - "outputId": "ec32ff15-4fe1-4d96-cfde-23646674c183" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[8462 2794 250 2331 1510 1899 4939 1419 4662 2329 6374 3728 2215 3851\n", - " 2118 3831 8094 6339 2965 6186 6293 3350 3517 6663 8045 527 3835 977\n", - " 4738 1922]\n", - "['香港', '山竹', '中國', '天氣', '印尼', '回顧', '海嘯', '十大', '死亡', '天文台', '聖誕', '搜尋', '大事', '政府', '增至', '改革開放', '除夕', '習近平', '年度', '綠色', '美國', '應對', '投資', '菲律賓', '防災', '今年', '攻略', '全球', '氣候變化', '國家']\n", - "[26.16350118 15.93219474 15.71192651 14.80553179 13.5922339 13.38647755\n", - " 12.46470025 12.29179706 10.89120018 10.68089521 10.15828423 10.10409692\n", - " 10.0310155 9.86769705 9.84936839 9.5722952 9.52388233 9.38129367\n", - " 9.10540175 8.91291438 8.80975938 8.56707131 8.51698975 8.28683973\n", - " 8.14802489 8.09461029 8.07364506 8.06091575 8.04323432 7.73563762]\n", - "[5631 4939 1510 5128 250 2965 1874 6164 4072 854 2789 2004 6523 4665\n", - " 577 2794 5181 1942 2815 3588 3075 977 2118 5590 5509 5596 4051 1175\n", - " 5651 6186]\n", - "['發展', '海嘯', '印尼', '漢字', '中國', '年度', '四川', '經濟', '日本', '傾瀉', '山泥', '地震', '至少', '死傷', '企業', '山竹', '火山', '國際', '峰會', '持續', '引發', '全球', '增至', '當局', '生活', '當選', '旅遊', '出席', '發表', '綠色']\n", - "[22.15468715 19.44111526 18.85954736 17.76023775 16.77314962 16.18458003\n", - " 16.01604573 15.82165993 15.58162015 14.69878035 13.67532406 12.06063331\n", - " 11.92966998 11.07906751 10.99972475 10.11194325 10.05897911 9.24536309\n", - " 9.03069368 8.95239066 8.10419508 8.09869622 7.60291212 7.33238136\n", - " 7.01679216 6.79966834 6.44421468 6.31693421 6.24547089 6.17028543]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names5,30)" - ] - }, - { - "cell_type": "markdown", - "id": "13759f3a", - "metadata": { - "id": "13759f3a" - }, - "source": [ - "## Anomaly6" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "98a9f6cb", - "metadata": { - "id": "98a9f6cb", - "outputId": "57169532-4b56-473f-dd5a-c044d45821b4" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 62, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly6['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_6 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names6 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_6)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_6, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "65fb8dfb", - "metadata": { - "id": "65fb8dfb", - "outputId": "9a13651d-9f12-4292-94ff-134f4b505c6c" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 5873 11500 3264 3256 6942 11928 9937 5944 7509 3328 11506 6797\n", - " 4342 6585 3098 3793 2223 4268 217 6481 752 4220 6050 7698\n", - " 3946 3267 4996 11658 6688 2974]\n", - "['暴雨', '雷暴', '天氣', '天文台', '深圳', '驟雨', '警告', '最高', '狂風', '失蹤', '雷雨', '洪水', '影響', '氣溫', '多雲', '小時', '取消', '引發', '下午', '死亡', '今年', '廣東', '未來', '生效', '工人', '天氣預報', '持續', '預警', '沖走', '增至']\n", - "[80.14730256 69.19841187 64.66454106 60.50300652 55.38318192 46.55523157\n", - " 41.31013584 39.55962752 35.19916911 35.03906302 31.75496529 29.66662545\n", - " 25.61764013 23.28611946 22.48946187 22.31720267 21.87365709 21.40332862\n", - " 21.36404942 21.1913542 20.60469945 20.35374226 20.03894952 19.61908213\n", - " 19.33341744 18.70838894 18.68532794 18.29370358 17.48382603 17.14043636]\n", - "[ 44 101 374 7879 9891 2355 2733 8900 1354 11857 8698 1181\n", - " 11953 5415 4996 4883 336 8672 3908 11639 752 8558 3912 2714\n", - " 8011 10050 8424 7665 8853 1865]\n", - "['一帶', '一路', '中國', '發展', '論壇', '合作', '國際', '習近平', '全球', '香港', '綠色', '傾瀉', '高峰', '政府', '持續', '投資', '世界', '經濟', '山泥', '項目', '今年', '紀錄', '山竹', '國家', '直播', '財經', '第二屆', '環保', '美國', '加強']\n", - "[56.58768554 55.61086546 37.75481332 36.77002623 30.6649202 29.46361465\n", - " 25.65629583 24.46413834 22.65977204 22.56990189 20.06190696 19.67327291\n", - " 17.40492453 17.375606 16.27091727 15.26718626 14.7050081 14.68826655\n", - " 13.73991043 13.46706544 12.75128864 12.58748866 12.54904234 12.49551197\n", - " 12.27651102 12.14734424 12.13557703 12.08846158 11.62826634 11.59658445]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names6,30)" - ] - }, - { - "cell_type": "markdown", - "id": "232c3478", - "metadata": { - "id": "232c3478" - }, - "source": [ - "## Anomaly7" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e9402a6c", - "metadata": { - "id": "e9402a6c", - "outputId": "6947f387-608b-415f-b530-4e8e33f38b1e" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 64, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly7['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_7 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names7 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_7)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_7, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8b441bea", - "metadata": { - "id": "8b441bea", - "outputId": "a3412a08-b18c-460a-eef0-15492cce9c5a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 3010 3016 11325 10882 11133 10391 7132 5528 9484 7329 11265 11371\n", - " 751 2884 6131 10990 11123 6974 3912 5644 1012 11360 7499 4628\n", - " 853 10758 6793 3610 4967 755]\n", - "['天文台', '天氣', '驟雨', '雷暴', '風球', '酷熱', '狂風', '最高', '警告', '生效', '香港', '高溫', '今日', '多雲', '氣溫', '韋帕', '風暴', '熱帶', '幾陣', '未來', '信號', '高度', '發展', '持續', '低壓', '陽光', '澳門', '局部地區', '改發', '今晚']\n", - "[81.08519802 78.62173906 65.88753956 62.51523084 51.24063036 50.12059143\n", - " 41.45734927 39.56309969 39.52550759 36.70317792 34.09420719 31.25184303\n", - " 29.83696526 29.68123032 29.54941549 28.54509464 27.77997059 27.39388054\n", - " 27.02390937 24.49619645 24.33638384 24.00789788 23.60669081 23.10737434\n", - " 22.7881612 22.61440445 22.41667241 22.1482223 21.93279047 20.11575493]\n", - "[ 5456 11147 386 5275 6022 1150 529 3083 471 3671 4628 9616\n", - " 7484 6399 1993 2164 7489 747 8733 2556 11058 6403 7446 8915\n", - " 104 6216 2090 2756 5457 5181]\n", - "['暴雨', '颱風', '中國', '日本', '死亡', '傾瀉', '九州', '失蹤', '丹娜', '山泥', '持續', '貴州', '登陸', '洪水', '印度', '台灣', '發出', '今年', '至少', '國際', '預警', '洪災', '疏散', '萬人', '一號', '江西', '受災', '增至', '暴雨成災', '新聞']\n", - "[75.47626912 46.83493384 35.09184757 34.66534396 32.72957909 30.75619462\n", - " 29.83179268 29.77055511 29.42701363 28.99534563 27.78779164 24.07798372\n", - " 23.66037522 23.28507005 23.17703697 22.99701561 21.79499139 21.75229491\n", - " 21.13749142 20.60162986 19.16275552 18.52108681 18.43076774 18.31931034\n", - " 18.00965439 17.73830918 17.67791105 17.47224198 17.34663494 16.98952617]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names7,30)" - ] - }, - { - "cell_type": "markdown", - "id": "1dd4ae84", - "metadata": { - "id": "1dd4ae84" - }, - "source": [ - "## Anomaly8" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71cf4371", - "metadata": { - "id": "71cf4371", - "outputId": "d12dccec-5194-4330-8383-b885b63f0677" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly8['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_8 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names8 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_8)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_8, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cc57c73", - "metadata": { - "id": "2cc57c73", - "outputId": "b78f8756-84ec-419e-924c-5a2fb8af8688" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 242 1685 3426 4772 5575 7092 1277 5285 2407 6237 6614 798 5924 2991\n", - " 220 3353 1258 415 4650 5108 2396 3322 3696 3650 4256 1261 1951 3681\n", - " 1779 4821]\n", - "['中國', '國際', '日本', '發展', '舉行', '香港', '即位', '綠色', '巡遊', '路障', '金融', '全球', '解放軍', '持續', '世界', '新聞', '博會', '亮相', '環保', '第二屆', '峰會', '新世界', '東京', '未來', '清理', '博覽會', '大灣', '李克強', '基金', '皇德仁']\n", - "[19.99700114 15.46635722 14.86275929 13.95727344 13.79984252 13.58969697\n", - " 12.61303871 11.60960773 11.08670637 10.54175131 10.22782664 10.01259199\n", - " 9.1530986 8.93475159 8.7794381 8.70973799 8.63842259 8.53602654\n", - " 8.52214091 8.22636029 7.87527106 7.36476435 7.35475793 7.27939246\n", - " 7.25872653 7.09571926 7.06346343 6.8890057 6.79453215 6.78373445]\n", - "[2084 4772 2991 7092 492 1401 3994 242 3991 2772 1615 798 1685 2928\n", - " 3522 6392 5307 1993 5555 348 1996 3523 1038 6004 1954 3968 5460 397\n", - " 5263 1672]\n", - "['威尼斯', '發展', '持續', '香港', '企業', '合作', '水災', '中國', '水浸', '意大利', '嚴重', '全球', '國際', '投資', '暴雨', '進入', '緊急狀態', '天晴', '至少', '乾燥', '天氣', '暴雨成災', '創新', '論壇', '大獎', '氣溫', '聯合國', '亞洲', '經濟', '國家']\n", - "[29.8303799 17.951432 17.48022947 14.49244388 13.39118086 12.01547138\n", - " 11.93337545 11.89174197 10.81969295 10.02906022 9.89846029 9.79035057\n", - " 9.65436342 9.4141742 9.37864623 9.08470956 9.01935327 8.83581734\n", - " 8.57789919 8.51015941 8.23956323 7.98138637 7.9771269 7.65456297\n", - " 7.41762075 7.34388951 7.3255817 7.1622021 7.15780645 6.97610602]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names8,30)" - ] - }, - { - "cell_type": "markdown", - "id": "be15f31b", - "metadata": { - "id": "be15f31b" - }, - "source": [ - "## Anomaly9" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47b97704", - "metadata": { - "id": "47b97704", - "outputId": "ed32ab90-36d5-454c-f462-e57b645ec521" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 68, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly9['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_9 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names9 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_9)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_9, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc584472", - "metadata": { - "id": "cc584472", - "outputId": "ad91fdeb-6773-486e-94bb-cecc8515ee58" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[4193 3054 1500 2352 3099 3652 4091 3623 4017 2608 5437 1434 5327 2677\n", - " 2817 4296 653 5516 3172 5492 625 5385 2918 3061 5068 2146 3780 391\n", - " 751 3482]\n", - "['肺炎', '武漢', '天氣', '持續', '氣溫', '發展', '罷工', '疫情', '經濟', '新冠', '風暴', '多雲', '雷暴', '日本', '最新', '英國', '全球', '驟雨', '法治', '香港', '內地', '項目', '東京', '死亡', '醫管局', '情人節', '確診', '企業', '冷鋒', '狂風']\n", - "[22.7117871 14.7962142 13.95923402 12.11395863 11.4467156 11.42776138\n", - " 11.0751255 11.00017054 10.68698703 10.47466373 10.29933445 9.96868688\n", - " 9.62183015 9.56760938 9.48042799 9.15872728 9.05108089 8.87426293\n", - " 8.71986076 8.51098414 8.40396877 8.2141112 8.18591121 8.1555543\n", - " 7.75654424 7.7231661 7.69543951 7.58740205 7.42240479 7.41505955]\n", - "[5395 1819 4680 3343 3623 4630 4028 2782 179 5492 1497 1349 2540 3097\n", - " 1828 2630 1032 2031 4463 2629 555 3271 5437 3568 3652 376 1679 309\n", - " 4549 1256]\n", - "['預算案', '山火', '財政', '澳洲', '疫情', '警告', '綠色', '暴雨', '中國', '香港', '天文台', '基金', '政府', '氣候變化', '岑智明', '新聞', '取消', '影響', '裁員', '新省', '億元', '減薪', '風暴', '生效', '發展', '以來', '寒冷', '交通', '計劃', '回應']\n", - "[18.65011177 13.65946084 13.4864836 13.20257306 9.72256163 9.63253825\n", - " 9.61575569 9.13329463 8.71260951 8.65674055 7.98492641 6.87273829\n", - " 6.51311878 6.50016791 6.37678562 5.89091673 5.86140892 5.81718975\n", - " 5.80779181 5.6925014 5.55840409 5.55113195 5.51076694 5.27471226\n", - " 5.24380575 5.21748959 5.14037173 5.13657866 4.96528318 4.93470888]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names9,30)" - ] - }, - { - "cell_type": "markdown", - "id": "8c621615", - "metadata": { - "id": "8c621615" - }, - "source": [ - "## Anomaly10" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c289ef77", - "metadata": { - "id": "c289ef77", - "outputId": "b5be380a-be7e-4efb-839a-ac97ee9b9f2c" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 70, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly10['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_10 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names10 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_10)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_10, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b87ab05e", - "metadata": { - "id": "b87ab05e", - "outputId": "314f623a-ef66-4c67-c608-a5e67c5699a2" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 2771 10115 10101 880 164 5750 5879 4569 3678 2777 6398 7766\n", - " 1264 6680 5651 7569 95 5262 4916 4780 10091 6840 5960 655\n", - " 1295 4315 5475 9239 5896 10212]\n", - "['天文台', '颱風', '風球', '信號', '三號', '沙德爾', '浪卡', '改發', '強風', '天氣', '熱帶', '考慮', '八號', '生效', '氣旋', '維持', '一號', '本港', '明日', '新聞', '風暴', '發出', '消息', '今日', '公里', '掛號', '機會', '逼近', '浪卡襲', '香港']\n", - "[116.02867242 75.27801599 75.15278519 59.3213883 51.8561299\n", - " 51.58829575 49.26837666 41.84094541 39.59446237 38.77571877\n", - " 38.63166269 36.91289835 35.7565145 35.55301688 34.6362126\n", - " 32.81276488 30.94523613 30.05853537 29.31787304 28.84127743\n", - " 28.3884404 26.77304337 26.58629385 25.4314269 25.39744584\n", - " 24.91795651 24.22357103 24.13429361 24.12318085 23.55347466]\n", - "[ 6844 337 10212 4231 6003 495 2011 2772 7743 2727 7559 1191\n", - " 9519 2339 5654 6805 2628 5206 6644 4140 7528 5129 719 10289\n", - " 2471 1551 8439 5098 9528 1272]\n", - "['發展', '中國', '香港', '持續', '深圳', '乾燥', '合作', '天晴', '習近平', '大致', '綠色', '全球', '金正恩', '國際', '氣溫', '疫情', '多雲', '服務', '環保', '投資', '經濟', '最高', '企業', '高度', '基金', '創新', '視察', '最低', '金融', '公司']\n", - "[37.03522628 36.87042448 30.5069981 26.84049652 25.47712694 21.95158618\n", - " 19.18957671 19.16189807 18.18166216 17.47402248 17.12388933 16.62115675\n", - " 15.80175442 15.37714642 14.72359882 14.58363692 14.25351759 14.23036963\n", - " 14.16530294 14.09826039 13.57178819 13.17237261 12.90190875 12.89796798\n", - " 12.03616497 12.02710229 11.87246362 11.79074695 11.46186608 11.29102309]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names10,30)" - ] - }, - { - "cell_type": "markdown", - "id": "d93114eb", - "metadata": { - "id": "d93114eb" - }, - "source": [ - "## Anomaly11" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "91d88c9e", - "metadata": { - "id": "91d88c9e", - "outputId": "19368920-1564-400a-8df1-fa12fd7235f8" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 72, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly11['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_11 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names11 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_11)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_11, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "46038a8f", - "metadata": { - "id": "46038a8f", - "outputId": "3b1b2092-533e-474c-e8ae-a2d7e10cedfb" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 2799 8466 5012 889 6602 2804 6745 10052 10229 5628 9746 2785\n", - " 6283 4764 3688 10242 10243 9721 649 84 5565 1922 2401 3253\n", - " 5196 5175 10013 1013 957 5587]\n", - "['天文台', '警告', '暴雨', '信號', '生效', '天氣', '發出', '香港', '黃色', '水浸', '雷暴', '大雨', '熱帶', '新聞', '影響', '黑色', '黑雨', '雨量', '今年', '一號', '毫米', '取消', '地區', '小時', '本港', '未來', '首個', '傾瀉', '停課', '氣旋']\n", - "[93.04562865 72.90236973 70.57353155 34.86067862 33.77397823 31.5481846\n", - " 29.03850943 26.10115796 25.35613051 23.90881811 23.8995282 23.30714026\n", - " 22.67480646 22.54769947 22.42485693 22.34676132 22.30998993 22.22541467\n", - " 22.09181568 20.92009677 20.91891708 20.90533795 19.92679985 19.92135594\n", - " 18.87782613 17.97653048 16.49850911 15.76361435 15.55697982 15.49121246]\n", - "[10104 5075 4222 2804 6752 9746 7455 5590 339 2649 9259 651\n", - " 4148 10052 9347 3309 4887 1163 9616 2474 10130 2029 8329 2763\n", - " 2360 3554 6558 7433 6955 2490]\n", - "['驟雨', '最高', '持續', '天氣', '發展', '雷暴', '綠色', '氣溫', '中國', '多雲', '酷熱', '今日', '投資', '香港', '金融', '局部地區', '明日', '全球', '陽光', '基金', '高度', '合作', '計劃', '大致', '國際', '幾陣', '環保', '經濟', '短暫', '報告']\n", - "[57.86270783 42.93529938 40.37357721 40.2562928 37.65618466 36.48167865\n", - " 35.64043528 34.78089752 28.4612732 27.75569876 25.73896605 24.93901288\n", - " 24.61601884 23.4075147 22.67568055 22.43042209 21.8638414 21.3668645\n", - " 19.86335499 19.62012992 18.28404394 18.26651518 18.23875818 18.09727007\n", - " 16.5995682 15.1803046 14.67150813 13.25882398 13.05720068 12.74019664]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names11,30)" - ] - }, - { - "cell_type": "markdown", - "id": "1854a071", - "metadata": { - "id": "1854a071" - }, - "source": [ - "## Anomaly12" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "7493cc2e", - "metadata": { - "id": "7493cc2e", - "outputId": "13c3718d-c394-4492-c3bb-d5520759ee7b" - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "\n", - "\n", - "
\n", - "" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 80, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "texts = anomaly12['keywords'].astype(str).to_list()\n", - "#chinese_stopwords=[\"、\",\"。\",\"〈\",\"〉\",\"《\",\"》\",\"一\",\"一些\",\"一何\",\"一切\",\"一則\",\"一方面\",\"一旦\",\"一來\",\"一樣\",\"一般\",\"一轉眼\",\"七\",\"萬一\",\"三\",\"上\",\"上下\",\"下\",\"不\",\"不僅\",\"不但\",\"不光\",\"不單\",\"不只\",\"不外乎\",\"不如\",\"不妨\",\"不盡\",\"不盡然\",\"不得\",\"不怕\",\"不惟\",\"不成\",\"不拘\",\"不料\",\"不是\",\"不比\",\"不然\",\"不特\",\"不獨\",\"不管\",\"不至於\",\"不若\",\"不論\",\"不過\",\"不問\",\"與\",\"與其\",\"與其說\",\"與否\",\"與此同時\",\"且\",\"且不說\",\"且說\",\"兩者\",\"個\",\"個別\",\"中\",\"臨\",\"為\",\"為了\",\"為什麽\",\"為何\",\"為止\",\"為此\",\"為著\",\"乃\",\"乃至\",\"乃至於\",\"麽\",\"之\",\"之一\",\"之所以\",\"之類\",\"烏乎\",\"乎\",\"乘\",\"九\",\"也\",\"也好\",\"也罷\",\"了\",\"二\",\"二來\",\"於\",\"於是\",\"於是乎\",\"雲雲\",\"雲爾\",\"五\",\"些\",\"亦\",\"人\",\"人們\",\"人家\",\"什\",\"什麽\",\"什麽樣\",\"今\",\"介於\",\"仍\",\"仍舊\",\"從\",\"從此\",\"從而\",\"他\",\"他人\",\"他們\",\"他們們\",\"以\",\"以上\",\"以為\",\"以便\",\"以免\",\"以及\",\"以故\",\"以期\",\"以來\",\"以至\",\"以至於\",\"以致\",\"們\",\"任\",\"任何\",\"任憑\",\"會\",\"似的\",\"但\",\"但凡\",\"但是\",\"何\",\"何以\",\"何況\",\"何處\",\"何時\",\"余外\",\"作為\",\"你\",\"你們\",\"使\",\"使得\",\"例如\",\"依\",\"依據\",\"依照\",\"便於\",\"俺\",\"俺們\",\"倘\",\"倘使\",\"倘或\",\"倘然\",\"倘若\",\"借\",\"借儻然\",\"假使\",\"假如\",\"假若\",\"做\",\"像\",\"兒\",\"先不先\",\"光是\",\"全體\",\"全部\",\"八\",\"六\",\"兮\",\"共\",\"關於\",\"關於具體地說\",\"其\",\"其一\",\"其中\",\"其二\",\"其他\",\"其余\",\"其它\",\"其次\",\"具體地說\",\"具體說來\",\"兼之\",\"內\",\"再\",\"再其次\",\"再則\",\"再有\",\"再者\",\"再者說\",\"再說\",\"冒\",\"沖\",\"況且\",\"幾\",\"幾時\",\"凡\",\"凡是\",\"憑\",\"憑借\",\"出於\",\"出來\",\"分\",\"分別\",\"則\",\"則甚\",\"別\",\"別人\",\"別處\",\"別是\",\"別的\",\"別管\",\"別說\",\"到\",\"前後\",\"前此\",\"前者\",\"加之\",\"加以\",\"即\",\"即令\",\"即使\",\"即便\",\"即如\",\"即或\",\"即若\",\"卻\",\"去\",\"又\",\"又及\",\"及\",\"及其\",\"及至\",\"反之\",\"反而\",\"反過來\",\"反過來說\",\"受到\",\"另\",\"另一方面\",\"另外\",\"另悉\",\"只\",\"只當\",\"只怕\",\"只是\",\"只有\",\"只消\",\"只要\",\"只限\",\"叫\",\"叮咚\",\"可\",\"可以\",\"可是\",\"可見\",\"各\",\"各個\",\"各位\",\"各種\",\"各自\",\"同\",\"同時\",\"後\",\"後者\",\"向\",\"向使\",\"向著\",\"嚇\",\"嗎\",\"否則\",\"吧\",\"吧噠\",\"含\",\"吱\",\"呀\",\"呃\",\"嘔\",\"唄\",\"嗚\",\"嗚呼\",\"呢\",\"呵\",\"呵呵\",\"呸\",\"呼哧\",\"咋\",\"和\",\"咚\",\"咦\",\"咧\",\"咱\",\"咱們\",\"咳\",\"哇\",\"哈\",\"哈哈\",\"哉\",\"哎\",\"哎呀\",\"哎喲\",\"嘩\",\"喲\",\"哦\",\"哩\",\"哪\",\"哪個\",\"哪些\",\"哪兒\",\"哪天\",\"哪年\",\"哪怕\",\"哪樣\",\"哪邊\",\"哪里\",\"哼\",\"哼唷\",\"唉\",\"唯有\",\"啊\",\"啐\",\"啥\",\"啦\",\"啪達\",\"啷當\",\"喂\",\"喏\",\"喔唷\",\"嘍\",\"嗡\",\"嗡嗡\",\"嗬\",\"嗯\",\"噯\",\"嘎\",\"嘎登\",\"噓\",\"嘛\",\"嘻\",\"嘿\",\"嘿嘿\",\"四\",\"因\",\"因為\",\"因了\",\"因此\",\"因著\",\"因而\",\"固然\",\"在\",\"在下\",\"在於\",\"地\",\"基於\",\"處在\",\"多\",\"多麽\",\"多少\",\"大\",\"大家\",\"她\",\"她們\",\"好\",\"如\",\"如上\",\"如上所述\",\"如下\",\"如何\",\"如其\",\"如同\",\"如是\",\"如果\",\"如此\",\"如若\",\"始而\",\"孰料\",\"孰知\",\"寧\",\"寧可\",\"寧願\",\"寧肯\",\"它\",\"它們\",\"對\",\"對於\",\"對待\",\"對方\",\"對比\",\"將\",\"小\",\"爾\",\"爾後\",\"爾爾\",\"尚且\",\"就\",\"就是\",\"就是了\",\"就是說\",\"就算\",\"就要\",\"盡\",\"盡管\",\"盡管如此\",\"豈但\",\"己\",\"已\",\"已矣\",\"巴\",\"巴巴\",\"年\",\"並\",\"並且\",\"庶乎\",\"庶幾\",\"開外\",\"開始\",\"歸\",\"歸齊\",\"當\",\"當地\",\"當然\",\"當著\",\"彼\",\"彼時\",\"彼此\",\"往\",\"待\",\"很\",\"得\",\"得了\",\"怎\",\"怎麽\",\"怎麽辦\",\"怎麽樣\",\"怎奈\",\"怎樣\",\"總之\",\"總的來看\",\"總的來說\",\"總的說來\",\"總而言之\",\"恰恰相反\",\"您\",\"惟其\",\"慢說\",\"我\",\"我們\",\"或\",\"或則\",\"或是\",\"或曰\",\"或者\",\"截至\",\"所\",\"所以\",\"所在\",\"所幸\",\"所有\",\"才\",\"才能\",\"打\",\"打從\",\"把\",\"抑或\",\"拿\",\"按\",\"按照\",\"換句話說\",\"換言之\",\"據\",\"據此\",\"接著\",\"故\",\"故此\",\"故而\",\"旁人\",\"無\",\"無寧\",\"無論\",\"既\",\"既往\",\"既是\",\"既然\",\"日\",\"時\",\"時候\",\"是\",\"是以\",\"是的\",\"更\",\"曾\",\"替\",\"替代\",\"最\",\"月\",\"有\",\"有些\",\"有關\",\"有及\",\"有時\",\"有的\",\"望\",\"朝\",\"朝著\",\"本\",\"本人\",\"本地\",\"本著\",\"本身\",\"來\",\"來著\",\"來自\",\"來說\",\"極了\",\"果然\",\"果真\",\"某\",\"某個\",\"某些\",\"某某\",\"根據\",\"歟\",\"正值\",\"正如\",\"正巧\",\"正是\",\"此\",\"此地\",\"此處\",\"此外\",\"此時\",\"此次\",\"此間\",\"毋寧\",\"每\",\"每當\",\"比\",\"比及\",\"比如\",\"比方\",\"沒奈何\",\"沿\",\"沿著\",\"漫說\",\"焉\",\"然則\",\"然後\",\"然而\",\"照\",\"照著\",\"猶且\",\"猶自\",\"甚且\",\"甚麽\",\"甚或\",\"甚而\",\"甚至\",\"甚至於\",\"用\",\"用來\",\"由\",\"由於\",\"由是\",\"由此\",\"由此可見\",\"的\",\"的確\",\"的話\",\"直到\",\"相對而言\",\"省得\",\"看\",\"眨眼\",\"著\",\"著呢\",\"矣\",\"矣乎\",\"矣哉\",\"離\",\"秒\",\"竟而\",\"第\",\"等\",\"等到\",\"等等\",\"簡言之\",\"管\",\"類如\",\"緊接著\",\"縱\",\"縱令\",\"縱使\",\"縱然\",\"經\",\"經過\",\"結果\",\"給\",\"繼之\",\"繼後\",\"繼而\",\"綜上所述\",\"罷了\",\"者\",\"而\",\"而且\",\"而況\",\"而後\",\"而外\",\"而已\",\"而是\",\"而言\",\"能\",\"能否\",\"騰\",\"自\",\"自個兒\",\"自從\",\"自各兒\",\"自後\",\"自家\",\"自己\",\"自打\",\"自身\",\"至\",\"至於\",\"至今\",\"至若\",\"致\",\"般的\",\"若\",\"若夫\",\"若是\",\"若果\",\"若非\",\"莫不然\",\"莫如\",\"莫若\",\"雖\",\"雖則\",\"雖然\",\"雖說\",\"被\",\"要\",\"要不\",\"要不是\",\"要不然\",\"要麽\",\"要是\",\"譬喻\",\"譬如\",\"讓\",\"許多\",\"論\",\"設使\",\"設或\",\"設若\",\"誠如\",\"誠然\",\"該\",\"說\",\"說來\",\"請\",\"諸\",\"諸位\",\"諸如\",\"誰\",\"誰人\",\"誰料\",\"誰知\",\"賊死\",\"賴以\",\"趕\",\"起\",\"起見\",\"趁\",\"趁著\",\"越是\",\"距\",\"跟\",\"較\",\"較之\",\"邊\",\"過\",\"還\",\"還是\",\"還有\",\"還要\",\"這\",\"這一來\",\"這個\",\"這麽\",\"這麽些\",\"這麽樣\",\"這麽點兒\",\"這些\",\"這會兒\",\"這兒\",\"這就是說\",\"這時\",\"這樣\",\"這次\",\"這般\",\"這邊\",\"這里\",\"進而\",\"連\",\"連同\",\"逐步\",\"通過\",\"遵循\",\"遵照\",\"那\",\"那個\",\"那麽\",\"那麽些\",\"那麽樣\",\"那些\",\"那會兒\",\"那兒\",\"那時\",\"那樣\",\"那般\",\"那邊\",\"那里\",\"都\",\"鄙人\",\"鑒於\",\"針對\",\"阿\",\"除\",\"除了\",\"除外\",\"除開\",\"除此之外\",\"除非\",\"隨\",\"隨後\",\"隨時\",\"隨著\",\"難道說\",\"零\",\"非\",\"非但\",\"非徒\",\"非特\",\"非獨\",\"靠\",\"順\",\"順著\",\"首先\",\" \",\"︿\",\"!\",\"#\",\"$\",\"%\",\"&\",\"(\",\")\",\"*\",\"+\",\",\",\"0\",\"1\",\"2\",\"3\",\"4\",\"5\",\"6\",\"7\",\"8\",\"9\",\":\",\";\",\"<\",\">\",\"?\",\"@\",\"[\",\"]\",\"{\",\"|\",\"}\",\"~\",\"¥\"]\n", - "\n", - "# the vectorizer object will be used to transform text to vector form\n", - "vectorizer = TfidfVectorizer(tokenizer=jieba.lcut, token_pattern='\\w+|\\$[\\d\\.]+|\\S+', stop_words = [\" \"])\n", - "\n", - "# apply transformation\n", - "tf_12 = vectorizer.fit_transform(texts)\n", - "\n", - "# tf_feature_names tells us what word each column in the matric represents\n", - "tf_feature_names12 = vectorizer.get_feature_names()\n", - "\n", - "#Distribute topics\n", - "lda_tfidf = LatentDirichletAllocation(n_components=2, random_state=0)\n", - "lda_tfidf.fit(tf_12)\n", - "\n", - "#Display result\n", - "vis = pyLDAvis.sklearn.prepare(lda_tfidf, tf_12, vectorizer)\n", - "pyLDAvis.enable_notebook()\n", - "pyLDAvis.display(vis)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6d21cf6b", - "metadata": { - "id": "6d21cf6b", - "outputId": "a6d51020-468b-4e7a-c3a0-c156deb16f0a" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[ 6627 13337 9014 3022 13706 13236 7291 3797 2215 7740 6713 7746\n", - " 2416 4918 10134 5263 434 5615 13197 1529 6628 13174 8972 2564\n", - " 9406 13341 4949 10477 7786 9342]\n", - "['暴雨', '颱風', '登陸', '四川', '黑格', '預警', '死亡', '失蹤', '北京', '洪水', '最高', '洪災', '南韓', '強降雨', '美國', '應急', '中國', '持續', '響應', '內地', '暴雨成災', '韓國', '疫情', '受災', '福建', '颶風', '影響', '至少', '浙江', '確診']\n", - "[81.34192864 65.67391808 51.07854106 41.1277383 37.8975487 37.72706202\n", - " 36.60172303 33.10025266 32.66198876 32.62356581 31.05246951 29.06524999\n", - " 27.27813714 26.64633713 25.65122017 25.56340609 25.25798228 25.09534105\n", - " 24.55538506 23.97049293 23.80379168 23.60394405 23.53248114 23.49574483\n", - " 22.66771037 21.85447114 21.83324997 21.30591322 21.09383705 19.29984676]\n", - "[ 3724 13585 13322 1184 3730 13337 13540 12411 13479 13064 200 1667\n", - " 8814 6031 6298 11335 8432 839 6871 4919 434 11051 8252 3384\n", - " 8047 9026 271 6837 7433 13579]\n", - "['天文台', '高斯', '風球', '信號', '天氣', '颱風', '驟雨', '酷熱', '香港', '雷暴', '三號', '八號', '生效', '改發', '新聞', '警告', '熱帶', '今年', '本港', '強風', '中國', '襲港', '澳門', '增強', '港股', '發展', '下午', '未來', '氣旋', '高度']\n", - "[111.20777858 87.73500998 70.72879868 62.72377542 59.08685389\n", - " 52.70133643 49.99282774 42.31984129 42.20480964 39.70873062\n", - " 38.6318031 38.20356342 38.17447052 36.37993738 34.49663645\n", - " 34.25815842 33.24165752 33.22310064 32.19971281 31.75837627\n", - " 31.09523802 30.95715184 30.63384365 29.3477748 28.43147891\n", - " 28.24302655 26.02302687 25.56560093 25.38828359 25.19760987]\n" - ] - } - ], - "source": [ - "plot_top_words(lda_tfidf,tf_feature_names12,30)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f41b8c49", - "metadata": { - "id": "f41b8c49" - }, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bdf68720", - "metadata": { - "id": "bdf68720" - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.12" - }, - "colab": { - "provenance": [], - "include_colab_link": true - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} \ No newline at end of file