Skip to content

Commit

Permalink
ORC-1571: Supports displaying raw data size in the meta command of …
Browse files Browse the repository at this point in the history
…orc-tools

### What changes were proposed in this pull request?
Display the raw (uncompressed) data size of the file in the output of the `meta` command.

### Why are the changes needed?
With this change, users can see both the compressed ORC file size and the uncompressed (raw) data size directly in the `meta` output.
This is similar to what `parquet-cli` prints, for example:

```
Row group 0:  count: 1000  210.95 B records  start: 4  total(compressed): 206.006 kB total(uncompressed):10.733 MB
```

### How was this patch tested?
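Unit tests: the expected `meta` output files under `java/tools/src/test/resources` were updated to include the raw data size.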

Closes #1726 from cxzl25/ORC-1571.

Authored-by: sychen <sychen@ctrip.com>
Signed-off-by: Dongjoon Hyun <dongjoon@apache.org>
  • Loading branch information
cxzl25 authored and dongjoon-hyun committed Jan 5, 2024
1 parent 68f4a8b commit bef54fc
Show file tree
Hide file tree
Showing 8 changed files with 13 additions and 3 deletions.
8 changes: 5 additions & 3 deletions java/tools/src/java/org/apache/orc/tools/FileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,13 @@ private FileDump() {
}

public static void main(Configuration conf, String[] args) throws Exception {
List<Integer> rowIndexCols = new ArrayList<Integer>(0);
List<Integer> rowIndexCols = new ArrayList<>(0);
Options opts = createOptions();
CommandLine cli = new DefaultParser().parse(opts, args);

if (cli.hasOption('h')) {
HelpFormatter formatter = new HelpFormatter();
formatter.printHelp("orcfiledump", opts);
formatter.printHelp("meta", opts);
return;
}

Expand All @@ -103,7 +103,7 @@ public static void main(Configuration conf, String[] args) throws Exception {
rowIndexCols = null; // All the columns
} else {
String[] colStrs = cli.getOptionValue("r").split(",");
rowIndexCols = new ArrayList<Integer>(colStrs.length);
rowIndexCols = new ArrayList<>(colStrs.length);
for (String colStr : colStrs) {
rowIndexCols.add(Integer.parseInt(colStr));
}
Expand Down Expand Up @@ -437,10 +437,12 @@ private static void printMetaDataImpl(final String filename,

FileSystem fs = file.getFileSystem(conf);
long fileLen = fs.getFileStatus(file).getLen();
long rawDataSize = reader.getRawDataSize();
long paddedBytes = getTotalPaddingSize(reader);
double percentPadding = (fileLen == 0) ? 0.0d : 100.0d * paddedBytes / fileLen;
DecimalFormat format = new DecimalFormat("##.##");
System.out.println("\nFile length: " + fileLen + " bytes");
System.out.println("File raw data size: " + rawDataSize + " bytes");
System.out.println("Padding length: " + paddedBytes + " bytes");
System.out.println("Padding ratio: " + format.format(percentPadding) + "%");
//print out any user metadata properties
Expand Down
2 changes: 2 additions & 0 deletions java/tools/src/java/org/apache/orc/tools/JsonFileDump.java
Original file line number Diff line number Diff line change
Expand Up @@ -207,10 +207,12 @@ public static void printJsonMetaData(List<String> files,

FileSystem fs = path.getFileSystem(conf);
long fileLen = fs.getContentSummary(path).getLength();
long rawDataSize = reader.getRawDataSize();
long paddedBytes = FileDump.getTotalPaddingSize(reader);
// empty ORC file is ~45 bytes. Assumption here is file length always >0
double percentPadding = ((double) paddedBytes / (double) fileLen) * 100;
writer.name("fileLength").value(fileLen);
writer.name("rawDataSize").value(rawDataSize);
writer.name("paddingLength").value(paddedBytes);
writer.name("paddingRatio").value(percentPadding);
AcidStats acidStats = OrcAcidUtils.parseAcidStats(reader);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ Stripes:
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 238 loadFactor: 0.0248 expectedFpp: 5.7562566E-12

File length: 275025 bytes
File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -189,6 +189,7 @@ Stripes:
Stripe level merge: numHashFunctions: 7 bitCount: 9600 popCount: 4948 loadFactor: 0.5154 expectedFpp: 0.00966294

File length: 332566 bytes
File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ Stripes:
Entry 0: count: 1000 hasNull: false min: Darkness,-230-368-488-586-862-930-1686-2044-2636-2652-2872-3108-3162-3192-3404-3442-3508-3542-3550-3712-3980-4146-4204-4336-4390-4418-4424-4490-4512-4650-4768-4924-4950-5210-5524-5630-5678-5710-5758-5952-6238-6252-6300-6366-6668-6712-6926-6942-7100-7194-7802-8030-8452-8608-8640-8862-8868-9134-9234-9412-9602-9608-9642-9678-9740-9780-10426-10510-10514-10706-10814-10870-10942-11028-11244-11326-11462-11496-11656-11830-12022-12178-12418-12832-13304-13448-13590-13618-13908-14188-14246-14340-14364-14394-14762-14850-14964-15048-15494-15674-15726-16006-16056-16180-16304-16332-16452-16598-16730-16810-16994-17210-17268-17786-17962-18214-18444-18446-18724-18912-18952-19164-19348-19400-19546-19776-19896-20084 max: worst-54-290-346-648-908-996-1038-1080-1560-1584-1620-1744-1770-1798-1852-1966-2162-2244-2286-2296-2534-2660-3114-3676-3788-4068-4150-4706-4744-5350-5420-5582-5696-5726-6006-6020-6024-6098-6184-6568-6636-6802-6994-7004-7318-7498-7758-7780-7798-7920-7952-7960-7988-8232-8256-8390-8416-8478-8620-8840-8984-9038-9128-9236-9248-9344-9594-9650-9714-9928-9938-10178-10368-10414-10502-10732-10876-11008-11158-11410-11722-11836-11964-12054-12096-12126-12136-12202-12246-12298-12616-12774-12782-12790-12802-12976-13216-13246-13502-13766-14454-14974-15004-15124-15252-15294-15356-15530-15610-16316-16936-17024-17122-17214-17310-17528-17682-17742-17870-17878-18010-18410-18524-18788-19204-19254-19518-19596-19786-19874-19904-20390-20752-20936 sum: 670762 positions: 0,0,0,0,0

File length: 2217712 bytes
File raw data size: 9009000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
Expand Down
1 change: 1 addition & 0 deletions java/tools/src/test/resources/orc-file-dump.json
Original file line number Diff line number Diff line change
Expand Up @@ -1377,6 +1377,7 @@
}
],
"fileLength": 275003,
"rawDataSize": 2144730,
"paddingLength": 0,
"paddingRatio": 0.0,
"status": "OK"
Expand Down
1 change: 1 addition & 0 deletions java/tools/src/test/resources/orc-file-dump.out
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,7 @@ Stripes:
Entry 0: count: 1000 hasNull: false min: Darkness, max: worst sum: 3866 positions: 0,0,0

File length: 271049 bytes
File raw data size: 2163000 bytes
Padding length: 0 bytes
Padding ratio: 0%

Expand Down
1 change: 1 addition & 0 deletions java/tools/src/test/resources/orc-file-has-null.out
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ Stripes:
Entry 4: count: 0 hasNull: true positions: 0,6,110,0,0,0,0

File length: 1844 bytes
File raw data size: 770000 bytes
Padding length: 0 bytes
Padding ratio: 0%
________________________________________________________________________________________________________________________
Expand Down

0 comments on commit bef54fc

Please sign in to comment.