Skip to content

Basic usage of ijson to load a json file

rNLKJA edited this page Mar 20, 2023 · 1 revision
class Twitter:
    """Plain record holding the three fields extracted for a single tweet.

    Attributes (all start as ``None`` and are filled in by the loader):
        _id:    tweet identifier (the loader stores it as ``np.int64``).
        author: author identifier (the loader stores it as ``np.int64``).
        locat:  full name of the place attached to the tweet (a string).
    """

    def __init__(self):
        # The creation order below is also the column order of any DataFrame
        # built from ``__dict__``, so keep it stable.
        for field in ("_id", "author", "locat"):
            setattr(self, field, None)

    def __repr__(self):
        """Return the fields as ``<_id> | <author> | <locat>``."""
        return "{} | {} | {}".format(self._id, self.author, self.locat)

# Stream a (potentially very large) JSON array of tweets with ijson and collect
# one Twitter record per array element, then turn the records into a DataFrame.
#
# NOTE(review): this snippet assumes `ijson`, `numpy as np` and `pandas as pd`
# have been imported earlier, and that the `Twitter` class is in scope.
filename = "../data/twitter-data-small.json"

# One Twitter object per top-level array element, in file order.
current_chunk = []

# ijson parses bytes, so open the file in binary mode; this avoids the
# deprecated text-mode path and a needless decode/encode round trip.
with open(filename, "rb") as f:
    # ijson.parse yields (prefix, event, value) triples lazily, so the file is
    # never loaded into memory as a whole. A single parser is enough.
    for prefix, event, value in ijson.parse(f):

        # A new element of the top-level array begins: start a fresh record.
        if prefix == "item" and event == "start_map":
            current_chunk.append(Twitter())

        # Every field of interest arrives as a 'string' event; skip the rest.
        elif event != "string":
            continue

        elif prefix == "item._id":
            current_chunk[-1]._id = np.int64(value)

        elif prefix == "item.data.author_id":
            current_chunk[-1].author = np.int64(value)

        # If a tweet lists several places, the last full_name seen wins.
        elif prefix == "item.includes.places.item.full_name":
            current_chunk[-1].locat = value

# Build the DataFrame once parsing is done and the file has been closed.
# Columns follow the attribute order of Twitter: _id, author, locat.
jdf = pd.DataFrame([item.__dict__ for item in current_chunk])

Need to consider:

  • How to divide a large JSON file into chunks (based on the number of processors) and load them into a data frame
  • Need to consider which data types to use for twitter_id and author_id, for faster computation
  • Need a function to handle string matching between location and sal.parquet
  • Need three functions to compute and store the final result
  • Need a file to record the elapsed time for different numbers of processors