@@ -62,6 +62,7 @@ import value;
62
62
import catalog;
63
63
import catalog_delta_entry;
64
64
import build_fast_rough_filter_task;
65
+ import stream_io;
65
66
66
67
namespace infinity {
67
68
@@ -428,34 +429,15 @@ void PhysicalImport::ImportCSV(QueryContext *query_context, ImportOperatorState
428
429
}
429
430
430
431
void PhysicalImport::ImportJSONL (QueryContext *query_context, ImportOperatorState *import_op_state) {
431
- LocalFileSystem fs;
432
- auto [file_handler, status] = fs.OpenFile (file_path_, FileFlags::READ_FLAG, FileLockType::kReadLock );
433
- if (!status.ok ()) {
434
- UnrecoverableError (status.message ());
435
- }
436
- DeferFn file_defer ([&]() { fs.Close (*file_handler); });
437
-
438
- SizeT file_size = fs.GetFileSize (*file_handler);
439
- String jsonl_str (file_size + 1 , 0 );
440
- SizeT read_n = file_handler->Read (jsonl_str.data (), file_size);
441
- if (read_n != file_size) {
442
- String error_message = fmt::format (" Read file size {} doesn't match with file size {}." , read_n, file_size);
443
- LOG_CRITICAL (error_message);
444
- UnrecoverableError (error_message);
445
- }
446
-
447
- if (read_n == 0 ) {
448
- auto result_msg = MakeUnique<String>(fmt::format (" Empty JSONL file, IMPORT 0 Rows" ));
449
- import_op_state->result_msg_ = std::move (result_msg);
450
- return ;
451
- }
432
+ StreamIO stream_io;
433
+ stream_io.Init (file_path_, FileFlags::READ_FLAG);
434
+ DeferFn file_defer ([&]() { stream_io.Close (); });
452
435
453
436
Txn *txn = query_context->GetTxn ();
454
437
u64 segment_id = Catalog::GetNextSegmentID (table_entry_);
455
438
SharedPtr<SegmentEntry> segment_entry = SegmentEntry::NewSegmentEntry (table_entry_, segment_id, txn);
456
439
UniquePtr<BlockEntry> block_entry = BlockEntry::NewBlockEntry (segment_entry.get (), 0 , 0 , table_entry_->ColumnCount (), txn);
457
440
458
- SizeT start_pos = 0 ;
459
441
Vector<ColumnVector> column_vectors;
460
442
for (SizeT i = 0 ; i < table_entry_->ColumnCount (); ++i) {
461
443
auto *block_column_entry = block_entry->GetColumnBlockEntry (i);
@@ -464,8 +446,34 @@ void PhysicalImport::ImportJSONL(QueryContext *query_context, ImportOperatorStat
464
446
465
447
SizeT row_count{0 };
466
448
while (true ) {
467
- if (start_pos >= file_size) {
449
+ String json_str;
450
+ if (stream_io.ReadLine (json_str)) {
451
+ nlohmann::json line_json = nlohmann::json::parse (json_str);
452
+
453
+ JSONLRowHandler (line_json, column_vectors);
454
+ block_entry->IncreaseRowCount (1 );
455
+ ++row_count;
456
+
457
+ if (block_entry->GetAvailableCapacity () <= 0 ) {
458
+ LOG_DEBUG (fmt::format (" Block {} saved, total rows: {}" , block_entry->block_id (), row_count));
459
+ segment_entry->AppendBlockEntry (std::move (block_entry));
460
+ if (segment_entry->Room () <= 0 ) {
461
+ LOG_DEBUG (fmt::format (" Segment {} saved, total rows: {}" , segment_entry->segment_id (), row_count));
462
+ SaveSegmentData (table_entry_, txn, segment_entry);
463
+ u64 segment_id = Catalog::GetNextSegmentID (table_entry_);
464
+ segment_entry = SegmentEntry::NewSegmentEntry (table_entry_, segment_id, txn);
465
+ }
466
+
467
+ block_entry = BlockEntry::NewBlockEntry (segment_entry.get (), segment_entry->GetNextBlockID (), 0 , table_entry_->ColumnCount (), txn);
468
+ column_vectors.clear ();
469
+ for (SizeT i = 0 ; i < table_entry_->ColumnCount (); ++i) {
470
+ auto *block_column_entry = block_entry->GetColumnBlockEntry (i);
471
+ column_vectors.emplace_back (block_column_entry->GetColumnVector (txn->buffer_mgr ()));
472
+ }
473
+ }
474
+ } else {
468
475
if (block_entry->row_count () == 0 ) {
476
+ column_vectors.clear ();
469
477
std::move (*block_entry).Cleanup ();
470
478
} else {
471
479
segment_entry->AppendBlockEntry (std::move (block_entry));
@@ -474,39 +482,10 @@ void PhysicalImport::ImportJSONL(QueryContext *query_context, ImportOperatorStat
474
482
std::move (*segment_entry).Cleanup ();
475
483
} else {
476
484
SaveSegmentData (table_entry_, txn, segment_entry);
485
+ LOG_DEBUG (fmt::format (" Last segment {} saved, total rows: {}" , segment_entry->segment_id (), row_count));
477
486
}
478
487
break ;
479
488
}
480
- SizeT end_pos = jsonl_str.find (' \n ' , start_pos);
481
- if (end_pos == String::npos) {
482
- end_pos = file_size;
483
- }
484
- std::string_view json_sv (jsonl_str.data () + start_pos, end_pos - start_pos);
485
- start_pos = end_pos + 1 ;
486
-
487
- nlohmann::json line_json = nlohmann::json::parse (json_sv);
488
-
489
- JSONLRowHandler (line_json, column_vectors);
490
- block_entry->IncreaseRowCount (1 );
491
- ++ row_count;
492
-
493
- if (block_entry->GetAvailableCapacity () <= 0 ) {
494
- LOG_DEBUG (fmt::format (" Block {} saved" , block_entry->block_id ()));
495
- segment_entry->AppendBlockEntry (std::move (block_entry));
496
- if (segment_entry->Room () <= 0 ) {
497
- LOG_DEBUG (fmt::format (" Segment {} saved" , segment_entry->segment_id ()));
498
- SaveSegmentData (table_entry_, txn, segment_entry);
499
- u64 segment_id = Catalog::GetNextSegmentID (table_entry_);
500
- segment_entry = SegmentEntry::NewSegmentEntry (table_entry_, segment_id, txn);
501
- }
502
-
503
- block_entry = BlockEntry::NewBlockEntry (segment_entry.get (), segment_entry->GetNextBlockID (), 0 , table_entry_->ColumnCount (), txn);
504
- column_vectors.clear ();
505
- for (SizeT i = 0 ; i < table_entry_->ColumnCount (); ++i) {
506
- auto *block_column_entry = block_entry->GetColumnBlockEntry (i);
507
- column_vectors.emplace_back (block_column_entry->GetColumnVector (txn->buffer_mgr ()));
508
- }
509
- }
510
489
}
511
490
512
491
auto result_msg = MakeUnique<String>(fmt::format (" IMPORT {} Rows" , row_count));
0 commit comments