Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added flag and tests for direct raw (unformatted) data handling #88

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,13 @@
![Version](https://img.shields.io/maven-central/v/com.epam/parso)

# Parso Java library

## Parso 2.0.15
***XX MMM 2021***

* Added the ability to read raw (unformatted) data from SAS files. The output is comparable to a SAS unformatted export, and this is the fastest way to read SAS data. Made the `getColumns` method of `SasFileParser` public.


## Parso 2.0.14
***19 February 2021***

Expand Down
21 changes: 17 additions & 4 deletions pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
<modelVersion>4.0.0</modelVersion>
<groupId>com.epam</groupId>
<artifactId>parso</artifactId>
<version>2.0.15-SNAPSHOT</version>
<version>2.0.15-RC1</version>
<packaging>jar</packaging>
<name>parso</name>
<description>Parso is a lightweight Java library designed to read SAS7BDAT datasets. The Parso interfaces
Expand Down Expand Up @@ -43,7 +43,11 @@
<name>Matthew Kastin</name>
<email>Fried.Egg@verizon.net</email>
</contributor>
</contributors>
<contributor>
<name>Ronald Steinhau</name>
<email>st@entimo.de</email>
</contributor>
</contributors>

<scm>
<connection>scm:git:git@github.com:epam/parso.git</connection>
Expand All @@ -56,9 +60,13 @@
<id>ossrh</id>
<url>https://oss.sonatype.org/content/repositories/snapshots</url>
</snapshotRepository>
<repository>
<!--<repository>
<id>ossrh</id>
<url>https://oss.sonatype.org/service/local/staging/deploy/maven2/</url>
</repository>-->
<repository>
<id>entimo-nexus</id>
<url>http://auto-test-w002.vm.entimo.de:8081/repository/maven-releases/</url>
</repository>
</distributionManagement>

Expand Down Expand Up @@ -121,10 +129,15 @@
<phase>deploy</phase>
</execution>
</executions>
<configuration>
<!--<configuration>
<serverId>ossrh</serverId>
<nexusUrl>https://oss.sonatype.org/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>-->
<configuration>
<serverId>entimo-nexus</serverId>
<nexusUrl>http://auto-test-w002.vm.entimo.de:8081/repository/maven-releases/</nexusUrl>
<autoReleaseAfterClose>false</autoReleaseAfterClose>
</configuration>
</plugin>
<plugin>
Expand Down
2 changes: 1 addition & 1 deletion src/main/java/com/epam/parso/impl/CSVDataWriterImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ public class CSVDataWriterImpl extends AbstractCSVWriter implements CSVDataWrite
* The map to store {@link Column#id} column identifier and the formatter
* for converting locale-sensitive values stored in this column into string.
*/
private final Map<Integer, Format> columnFormatters = new HashMap<>();
private Map<Integer, Format> columnFormatters = new HashMap<>();

/**
* The constructor that defines writer variable to output result csv file.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,6 @@ public void writeSasFileProperties(SasFileProperties sasFileProperties) throws I
* @throws IOException appears if the output into writer is impossible.
*/
private void constructPropertiesString(String propertyName, Object property) throws IOException {
getWriter().write(propertyName + property + "\n");
getWriter().write(propertyName + String.valueOf(property) + "\n");
}
}
25 changes: 23 additions & 2 deletions src/main/java/com/epam/parso/impl/SasFileParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,10 @@ public final class SasFileParser {
*/
private final OutputDateType outputDateType;

/**
* The flag to use data output as unformatted data.
*/
private final Boolean unformatted;
/**
* The list of current page data subheaders.
*/
Expand Down Expand Up @@ -265,6 +269,8 @@ private SasFileParser(Builder builder) {
sasFileStream = new DataInputStream(builder.sasFileStream);
byteOutput = builder.byteOutput;
outputDateType = builder.outputDateType;
unformatted = builder.unformatted;


Map<SubheaderIndexes, ProcessingSubheader> tmpMap = new HashMap<>();
tmpMap.put(SubheaderIndexes.ROW_SIZE_SUBHEADER_INDEX, new RowSizeSubheader());
Expand Down Expand Up @@ -892,7 +898,7 @@ private Object processElement(byte[] source, int offset, int currentColumnIndex)
if (columnsDataLength.get(currentColumnIndex) <= 2) {
return bytesToShort(temp);
} else {
if (columns.get(currentColumnIndex).getFormat().getName().isEmpty()) {
if (unformatted || columns.get(currentColumnIndex).getFormat().getName().isEmpty()) {
return convertByteArrayToNumber(temp);
} else {
ColumnFormat columnFormat = columns.get(currentColumnIndex).getFormat();
Expand Down Expand Up @@ -1188,7 +1194,7 @@ private byte[] trimBytesArray(byte[] source, int offset, int length) {
*
* @return columns list.
*/
List<Column> getColumns() {
public List<Column> getColumns() {
return columns;
}

Expand Down Expand Up @@ -1314,6 +1320,10 @@ private Builder() {
*/
private Boolean byteOutput = false;

/**
* Default value for {@link SasFileParser#unformatted} variable.
*/
private Boolean unformatted = false;
/**
* The constructor that specifies builders sasFileStream variable.
*
Expand Down Expand Up @@ -1358,6 +1368,17 @@ public Builder byteOutput(Boolean val) {
return this;
}

/**
 * The function to specify builders unformatted variable.
 *
 * @param val value to be set; {@code null} is treated as {@code false}.
 * @return result builder.
 */
public Builder unformatted(Boolean val) {
    // The parser evaluates this flag inside a boolean expression, where a null Boolean
    // would fail on auto-unboxing; normalize it here instead.
    unformatted = Boolean.TRUE.equals(val);
    return this;
}

/**
* The function to create variable of SasFileParser class using current builder.
*
Expand Down
9 changes: 9 additions & 0 deletions src/main/java/com/epam/parso/impl/SasFileReaderImpl.java
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,15 @@ public SasFileReaderImpl(InputStream inputStream, Boolean byteOutput) {
sasFileParser = new SasFileParser.Builder(inputStream).byteOutput(byteOutput).build();
}

/**
 * Creates a SasFileReaderImpl whose parser is produced by the supplied builder, so that
 * callers can configure the parser options themselves before the reader is constructed.
 *
 * @param builder the pre-configured sas file parser builder; its {@code build()} method is
 *                invoked here.
 */
public SasFileReaderImpl(SasFileParser.Builder builder) {
    this.sasFileParser = builder.build();
}

/**
* The function to get the {@link Column} list from {@link SasFileParser}.
*
Expand Down
8 changes: 5 additions & 3 deletions src/test/java/com/epam/parso/CSVDataWriterUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ public void testData() {
if (resourcesPath != null) {
List<File> files = getSas7bdatFilesList(resourcesPath.getFile() + "//" + FOLDER_NAME);
for (File currentFile : files) {
SasFileReaderUnitTest sasFileReaderUnitTest = new SasFileReaderUnitTest();
sasFileReaderUnitTest.setFileName(FOLDER_NAME + "//" + currentFile.getName());
sasFileReaderUnitTest.testData();
if (currentFile.isFile()) {
SasFileReaderUnitTest sasFileReaderUnitTest = new SasFileReaderUnitTest();
sasFileReaderUnitTest.setFileName(FOLDER_NAME + "//" + currentFile.getName());
sasFileReaderUnitTest.testData();
}
}
}
}
Expand Down
40 changes: 40 additions & 0 deletions src/test/java/com/epam/parso/SasFileReaderUnitTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import au.com.bytecode.opencsv.CSVReader;
import com.epam.parso.impl.CSVDataWriterImpl;
import com.epam.parso.impl.CSVMetadataWriterImpl;
import com.epam.parso.impl.SasFileParser;
import com.epam.parso.impl.SasFileReaderImpl;
import org.junit.Test;
import org.slf4j.Logger;
Expand Down Expand Up @@ -157,6 +158,45 @@ public void testMetadata() {
logger.info("Time passed: {} ms", System.currentTimeMillis() - programStart);
}

/**
 * Reads the current file with the parser's {@code unformatted} flag enabled, writes each row
 * as CSV and compares the written rows with the control CSV file in batches of
 * {@code COMPARE_ROWS_COUNT} rows. The control file path is derived from the file name by
 * replacing the {@code sas7bdat/} folder with {@code csv/} and the extension with {@code .csv}.
 */
@Test
public void testUnformatted() {
    long programStart = System.currentTimeMillis();
    InputStream fileInputStream = getResourceAsStream(fileName);
    logger.info("Processing file {}", fileName);
    StringWriter writer = new StringWriter();
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    InputStreamReader inputStreamReader = new InputStreamReader(
            getResourceAsStream(fileName.toLowerCase(Locale.ROOT).replace("sas7bdat/", "csv/").
                    replace(".sas7bdat", ".csv")));
    try {
        SasFileParser.Builder builder =
                new SasFileParser.Builder(fileInputStream).unformatted(true);
        SasFileReader sasFileReader = new SasFileReaderImpl(builder);
        long rowCount = sasFileReader.getSasFileProperties().getRowCount();
        List<Column> columns = sasFileReader.getColumns();
        CSVReader controlReader = new CSVReader(inputStreamReader);
        CSVDataWriter csvDataWriter = new CSVDataWriterImpl(writer, ",", "\n", Locale.UK);
        // Skip the header line of the control file.
        controlReader.readNext();
        for (int i = 0; i < rowCount; i++) {
            csvDataWriter.writeRow(sasFileReader.getColumns(), sasFileReader.readNext());
            if (i != 0 && i % COMPARE_ROWS_COUNT == 0) {
                compareResultWithControl(controlReader, writer, i - COMPARE_ROWS_COUNT,
                        columns);
                // Reset the buffer so that the next batch starts from an empty writer.
                writer.getBuffer().setLength(0);
            }
        }
        // Compare the rows written since the last batch comparison.
        compareResultWithControl(controlReader, writer,
                (int) (rowCount - rowCount % COMPARE_ROWS_COUNT), columns);
        // The control file must not contain more rows than were compared.
        assertThat(controlReader.readNext()).isNull();
    } catch (IOException e) {
        logger.error(e.getMessage(), e);
        // An I/O failure means the comparison did not complete, so the test must not pass.
        throw new AssertionError("Failed to process file " + fileName, e);
    } finally {
        closeWriter(writer);
        closeInputStream(fileInputStream);
        closeInputStreamReader(inputStreamReader);
    }
    logger.info("Time passed: {} ms", System.currentTimeMillis() - programStart);
}

@Test
public void testData() {
long programStart = System.currentTimeMillis();
Expand Down
27 changes: 27 additions & 0 deletions src/test/java/com/epam/parso/UnformattedUnitTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package com.epam.parso;

import static com.epam.parso.TestUtils.getSas7bdatFilesList;

import java.io.File;
import java.net.URL;
import java.util.List;
import org.junit.Test;

/**
 * Runs {@link SasFileReaderUnitTest#testUnformatted()} for every file returned by
 * {@code getSas7bdatFilesList} for the {@code sas7bdat/unformatted} test resources folder.
 */
public class UnformattedUnitTest {

    /**
     * The resources folder, relative to the test classpath root, holding the input files.
     */
    private static final String FOLDER_NAME = "sas7bdat/unformatted";

    /**
     * Executes the unformatted comparison test for each regular file in the folder.
     * Does nothing when the classpath root resource cannot be resolved.
     */
    @Test
    public void testUnformatted() {
        URL resourcesPath = this.getClass().getClassLoader().getResource("");
        if (resourcesPath != null) {
            List<File> files = getSas7bdatFilesList(resourcesPath.getFile() + "//" + FOLDER_NAME);
            for (File currentFile : files) {
                // Same guard as in CSVDataWriterUnitTest: skip anything that is not a regular file.
                if (currentFile.isFile()) {
                    SasFileReaderUnitTest sasFileReaderUnitTest = new SasFileReaderUnitTest();
                    sasFileReaderUnitTest.setFileName(FOLDER_NAME + "//" + currentFile.getName());
                    sasFileReaderUnitTest.testUnformatted();
                }
            }
        }
    }

}
38 changes: 38 additions & 0 deletions src/test/resources/csv/unformatted/all_rand_normal.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
x1,x2,x3,x4,x5,x6,x7,x8
0,1.77283196480627,1.55961845104152,-0.0754841337123781,-0.127310386905666,1.27160971116606,-0.373485088927311,-0.491478299933984
1,0.0196004307029624,-0.413369625750338,-0.168943253776815,0.548823278255197,0.190611907286572,0.461311082111462,-0.183821951945449
2,1.71152031127659,-0.343253067969038,-2.4441579734022,0.771278181748259,-0.790959408495155,-1.1678181616338,-1.42230765453915
3,0.283548642994546,0.506250013338169,0.73547312521291,1.0921602561092,-1.42634104687771,-0.547784365367733,-0.0679134396910904
4,-0.0283595950513087,1.19265624530262,-1.95700253011149,-1.31936124633078,1.13583204842472,-0.773892287552427,0.0306183416824843
5,-1.22009820639777,-0.853206793552464,-1.74368068722334,2.59744290896532,0.283189394665112,1.06409063075838,-1.07707365923793
6,-1.30601106460683,-0.191645716555782,-0.890895281080635,-1.01420554584294,0.239205026896137,-1.36956281497245,0.367116358963696
7,-0.182529555280925,1.70588149747924,-0.648252455053375,-0.113856279856997,-1.59414475661726,-0.0716625039715658,0.422675801034446
8,1.55587487228279,-0.240535685327487,-0.987747968432627,0.379810225477961,-0.616028300364221,-1.87098495560613,0.63376130380024
9,0.747240400228516,0.0760947073326997,-0.755460377522965,0.146127904800772,-1.22177835494399,-0.709271266564636,2.57184269865151
10,-0.279883956321368,1.47435132507343,-0.948816009788528,-0.844456583968411,0.139119584115725,1.35427048218451,-1.27013451653836
20,1.41199698228818,0.836223566992313,0.266081420325081,-1.07577894523886,-0.776099337225336,-0.292190865045215,-0.629139226286168
30,-1.25875867828822,0.185198950004779,-0.116324903204208,-1.03625723019971,-0.0170051390322138,-0.322461408145671,0.00632738879553362
40,0.891389660634871,-0.883007989589512,-0.654361699302352,1.26265202786226,1.13288576340466,-0.0927398141356724,0.0672848024456528
50,2.15000039664857,0.749410953295052,-1.16764981806199,0.466677499486481,-0.0552601977313928,0.0143111301094924,-0.39946607189219
60,-1.93359322664556,0.586069118999608,-0.880669113344459,1.1938351727539,0.00639352792796532,-0.335836488054767,-1.99733626831551
70,-0.354768940788262,1.47138941532739,-1.12631731608776,0.926390501729092,0.648658190157469,0.634162197275018,0.290289574236135
80,0.0835885754002079,-0.625016364355535,0.222253926055733,0.557686378030217,0.195660555797913,1.81749917006098,0.774177285515222
90,0.980586753061465,0.811358110206662,-1.21183157719341,-1.94250131734993,0.610315915827186,0.710791368769147,0.775645090767247
100,-0.885723145885718,0.911684489336932,0.0725798356919812,0.298512405917229,1.15812238929818,-0.0954193025409115,0.105742722666326
200,0.073341242308597,1.59695223746507,1.01675510796357,-0.693741985085777,1.55116578143842,0.0939441882928796,0.604980235424436
300,-0.385784118797266,1.82988283496693,1.12517766054923,0.00111941829389953,-0.618389596735417,-0.406807842285521,-0.528658442128839
400,0.820577114757841,0.521697367895846,-0.694475698023284,-0.309656916502026,-0.5232281841184,-0.432605761053027,0.969883117390241
500,1.01132982744969,-0.136643201992169,-0.144699714119234,-0.617247418502427,-0.894839531612208,0.59543330239254,0.161699890973567
600,0.0173080025830328,1.96583552713622,-0.0975631622996622,1.44967041850502,1.30572126945076,-0.079037431393819,-0.822598726800221
700,1.21325496342318,-1.03384987227854,-1.47023406174145,-0.851374678280646,0.478954438954333,-1.37988316041114,-0.66390791967781
800,-1.45903869393655,0.608575023342467,-0.164410960357209,0.973623590571936,-1.66100298178539,-0.385906285971901,0.791126146572437
900,0.0711062229668704,0.965142014209498,0.923239725814131,0.607529951446925,1.35714544347716,1.27576836692125,1.22171126921072
1000,0.460813274542927,0.176818113472297,-0.920449876629994,1.195208039424,0.749407485199749,1.8103596980305,0.02286574249516
2000,0.347928557650661,-0.00696355715946745,-0.545867059586143,-0.0580779806589241,-0.137701200250452,1.39221029983964,-0.45439960760657
3000,-0.346252286337901,0.194505466564128,2.14087461657989,0.418750181224778,-0.639640747017277,0.12524404451905,0.515966160602984
4000,-0.227185959117449,-1.65507361758266,-2.5212330552349,0.369569218076777,-0.975997367840647,0.681413904874087,-0.0773682593857234
5000,0.171799804525913,-1.51944142865384,-2.14690205526398,2.08776505678444,0.773110345352075,0.584365829771641,0.479143189209444
6000,-0.0999687158096539,-0.224922722661865,-1.65743281784574,-1.13913013099584,1.15087518288969,-0.232357392125228,0.868149746837202
7000,0.0924469148487761,0.68659582758969,0.454186577530023,-0.458935944517113,0.496971538445123,0.259198243908596,-0.690092617039468
8000,0.0324073322397376,-0.307664407305189,0.185706633764026,-1.37539086652721,-0.499079602526934,0.769118023249378,-0.735253974317273
9000,-0.300203031670214,2.7077947819045,0.0378987622204742,1.8252323273999,-1.04103914890652,-0.100344517017144,0.0408366508002207
Loading