import sys
- from typing import Dict, Optional, Union, Any, List
+ import hashlib
+ import pandas as pd
+ from typing import Dict, Optional, Union, Any, List, Tuple
+ from typing_extensions import Literal

from pygwalker.data_parsers.base import BaseDataParser, FieldSpec
from pygwalker.data_parsers.database_parser import Connector
from pygwalker._typing import DataFrame


__classname2method = {}
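
+ # Identifies which backend a dataset comes from; get_dataset_hash branches on it.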
+ DatasetType = Literal['pandas', 'polars', 'modin', 'pyspark', 'connector', 'cloud_dataset']
+

# pylint: disable=import-outside-toplevel
- def _get_data_parser(dataset: Union[DataFrame, Connector, str]) -> BaseDataParser:
+ def _get_data_parser(dataset: Union[DataFrame, Connector, str]) -> Tuple[BaseDataParser, DatasetType]:
    """
    Get the DataFrameDataParser for a dataset.
    TODO: Maybe you can find a better way to handle the following code
    """
    if type(dataset) in __classname2method:
        return __classname2method[type(dataset)]

-     if 'pandas' in sys.modules:
-         import pandas as pd
-         if isinstance(dataset, pd.DataFrame):
-             from pygwalker.data_parsers.pandas_parser import PandasDataFrameDataParser
-             __classname2method[pd.DataFrame] = PandasDataFrameDataParser
-             return __classname2method[pd.DataFrame]
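+     # pandas is a hard dependency imported at module level, so no sys.modules probe is needed.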
+     if isinstance(dataset, pd.DataFrame):
+         from pygwalker.data_parsers.pandas_parser import PandasDataFrameDataParser
+         __classname2method[pd.DataFrame] = (PandasDataFrameDataParser, "pandas")
+         return __classname2method[pd.DataFrame]
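
+     # Optional backends are probed via sys.modules so polars/modin/pyspark are
+     # never imported unless the caller has already loaded them.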
    if 'polars' in sys.modules:
        import polars as pl
        if isinstance(dataset, pl.DataFrame):
            from pygwalker.data_parsers.polars_parser import PolarsDataFrameDataParser
-             __classname2method[pl.DataFrame] = PolarsDataFrameDataParser
+             __classname2method[pl.DataFrame] = (PolarsDataFrameDataParser, "polars")
            return __classname2method[pl.DataFrame]

    if 'modin.pandas' in sys.modules:
        from modin import pandas as mpd
        if isinstance(dataset, mpd.DataFrame):
            from pygwalker.data_parsers.modin_parser import ModinPandasDataFrameDataParser
-             __classname2method[mpd.DataFrame] = ModinPandasDataFrameDataParser
+             __classname2method[mpd.DataFrame] = (ModinPandasDataFrameDataParser, "modin")
            return __classname2method[mpd.DataFrame]

    if 'pyspark' in sys.modules:
        from pyspark.sql import DataFrame as SparkDataFrame
        if isinstance(dataset, SparkDataFrame):
            from pygwalker.data_parsers.spark_parser import SparkDataFrameDataParser
-             __classname2method[SparkDataFrame] = SparkDataFrameDataParser
+             __classname2method[SparkDataFrame] = (SparkDataFrameDataParser, "pyspark")
            return __classname2method[SparkDataFrame]
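
+     # Non-DataFrame inputs: a database Connector or a cloud dataset id string.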
    if isinstance(dataset, Connector):
        from pygwalker.data_parsers.database_parser import DatabaseDataParser
-         __classname2method[DatabaseDataParser] = DatabaseDataParser
+         __classname2method[DatabaseDataParser] = (DatabaseDataParser, "connector")
        return __classname2method[DatabaseDataParser]

    if isinstance(dataset, str):
        from pygwalker.data_parsers.cloud_dataset_parser import CloudDatasetParser
-         __classname2method[CloudDatasetParser] = CloudDatasetParser
+         __classname2method[CloudDatasetParser] = (CloudDatasetParser, "cloud_dataset")
        return __classname2method[CloudDatasetParser]

    raise TypeError(f"Unsupported data type: {type(dataset)}")
@@ -70,11 +73,45 @@ def get_parser(
    if other_params is None:
        other_params = {}

-     parser = _get_data_parser(dataset)(
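+     # _get_data_parser now returns (parser_class, dataset_type); only the class is needed here.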
+     parser_func, _ = _get_data_parser(dataset)
+     parser = parser_func(
        dataset,
        field_specs,
        infer_string_to_date,
        infer_number_to_dimension,
        other_params
    )
    return parser
+
+
+ def get_dataset_hash(dataset: Union[DataFrame, Connector, str]) -> str:
+     """A cheap, less accurate way to derive a hash that distinguishes datasets."""
+     _, dataset_type = _get_data_parser(dataset)
+     if dataset_type in ["pandas", "modin", "polars"]:
+         row_count = dataset.shape[0]
+         other_info = str(dataset.shape) + "_" + dataset_type
+         if dataset_type == "modin":
+             dataset = dataset._to_pandas()
+         if row_count > 4000:
+             # Hash only the head and tail of large frames. Concatenate explicitly:
+             # `dataset[:2000] + dataset[-2000:]` would add values element-wise,
+             # not stack rows.
+             if dataset_type == "polars":
+                 import polars as pl
+                 dataset = pl.concat([dataset[:2000], dataset[-2000:]])
+             else:
+                 dataset = pd.concat([dataset[:2000], dataset[-2000:]])
+         if dataset_type in ["pandas", "modin"]:
+             hash_bytes = pd.util.hash_pandas_object(dataset).values.tobytes() + other_info.encode()
+         else:
+             hash_bytes = dataset.hash_rows().to_numpy().tobytes() + other_info.encode()
+         return hashlib.md5(hash_bytes).hexdigest()
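+
+     # Spark frames stay distributed; cap the sample before collecting to the driver.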
+     if dataset_type == "pyspark":
+         shape = (dataset.count(), len(dataset.columns))
+         row_count = shape[0]
+         other_info = str(shape) + "_" + dataset_type
+         if row_count > 4000:
+             dataset = dataset.limit(4000)
+         dataset_pd = dataset.toPandas()
+         hash_bytes = pd.util.hash_pandas_object(dataset_pd).values.tobytes() + other_info.encode()
+         return hashlib.md5(hash_bytes).hexdigest()
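+
+     # Connectors and cloud datasets are identified by their source, not their contents.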
+     if dataset_type == "connector":
+         return hashlib.md5("_".join([dataset.url, dataset.view_sql, dataset_type]).encode()).hexdigest()
+
+     if dataset_type == "cloud_dataset":
+         return hashlib.md5("_".join([dataset, dataset_type]).encode()).hexdigest()
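
For context, a minimal sketch of how the new helper behaves (hypothetical usage; the import path assumes this module is `pygwalker/data_parsers/__init__.py`, and the assertions only illustrate intent):

```python
import pandas as pd

from pygwalker.data_parsers import get_dataset_hash  # assumed import path

df = pd.DataFrame({"name": ["a", "b"], "value": [1, 2]})

# Deterministic for identical data...
assert get_dataset_hash(df) == get_dataset_hash(df.copy())

# ...and (almost always) different once values or shape change.
assert get_dataset_hash(df) != get_dataset_hash(df.assign(value=[1, 3]))
```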