Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Stopwords Fix PR #1012

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
16 changes: 16 additions & 0 deletions common/client/src/main/java/zingg/common/client/HasStopWords.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package zingg.common.client;

import scala.Serializable;

public class HasStopWords implements Serializable {

public static boolean isStopwordField(FieldDefinition f){
if(!(f.getStopWords() == null || f.getStopWords() == "")){
return true;
}
else {
return false;
Fixed Show fixed Hide fixed
}
Comment on lines +8 to +13

Check warning

Code scanning / PMD

This if statement can be replaced by `return {condition};`

This if statement can be replaced by `return {condition};`
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import zingg.common.client.FieldDefinition;
import zingg.common.client.IArguments;
import zingg.common.client.IZArgs;
import zingg.common.client.MatchType;
import zingg.common.client.ZFrame;
import zingg.common.client.ZinggClientException;
Expand Down Expand Up @@ -251,7 +250,7 @@ public List<FieldDefinition> getFieldDefinitionFiltered(IArguments args, MatchT
.stream()
.filter(f -> !(f.getMatchType() == null || f.getMatchType().contains(type)))
.collect(Collectors.toList());
}
}

public ZFrame<D,R,C> postprocess(ZFrame<D,R,C> actual, ZFrame<D,R,C> orig) {
List<C> cols = new ArrayList<C>();
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package zingg.common.client.util;

import java.io.Serializable;
import java.util.ArrayList;
import java.util.List;
import java.util.stream.Collectors;

import zingg.common.client.FieldDefinition;
import zingg.common.client.HasStopWords;
import zingg.common.client.IArguments;

public class StopWordFieldDefUtility implements Serializable {

private static final long serialVersionUID = 1L;

public List<? extends FieldDefinition> getFieldDefinitionWithStopwords(List<? extends FieldDefinition> fieldDefinition) {
return fieldDefinition.stream()
.filter(f -> HasStopWords.isStopwordField(f))
.collect(Collectors.toList());
}

public String getFieldDefinitionNamesWithStopwords(IArguments args) {
List<FieldDefinition> list = args.getFieldDefinition()
sania-16 marked this conversation as resolved.
Show resolved Hide resolved
.stream()
.filter(f -> HasStopWords.isStopwordField(f))
.collect(Collectors.toList());

List<String> fieldNamesList = new ArrayList<String>();
for(FieldDefinition fd: list){
fieldNamesList.add(fd.getName());
}

String fieldNames = fieldNamesList.stream()
Fixed Show fixed Hide fixed
.collect(Collectors.joining(", "));
return fieldNames;
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
package zingg.common.client.util;

import static org.junit.jupiter.api.Assertions.assertEquals;

import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.junit.jupiter.api.Test;

import zingg.common.client.Arguments;
import zingg.common.client.FieldDefinition;
import zingg.common.client.IArguments;
import zingg.common.client.MatchType;
import zingg.common.client.ZinggClientException;

public class TestStopWordFieldDefUtility {

private static final Log LOG = LogFactory.getLog(TestStopWordFieldDefUtility.class);
Fixed Show fixed Hide fixed
Fixed Show fixed Hide fixed

StopWordFieldDefUtility stopWordFieldDefUtil = new StopWordFieldDefUtility();

@Test
public void testGetFieldDefinitionWithStopwords(){
sania-16 marked this conversation as resolved.
Show resolved Hide resolved
try {
FieldDefinition def1 = new FieldDefinition();
def1.setFieldName("field1");
def1.setDataType("string");
def1.setMatchTypeInternal(MatchType.FUZZY);
def1.setFields("field1");

FieldDefinition def2 = new FieldDefinition();
def2.setFieldName("field2");
def2.setDataType("string");
def2.setMatchTypeInternal(MatchType.EXACT);
def2.setStopWords("stopWordsFileName2");
def2.setFields("field2");

FieldDefinition def3 = new FieldDefinition();
def3.setFieldName("field3");
def3.setDataType("string");
def3.setMatchTypeInternal(MatchType.FUZZY);
def3.setStopWords("");
def3.setFields("field3");

List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
fieldDef.add(def1);
fieldDef.add(def2);
fieldDef.add(def3);

List<? extends FieldDefinition> stopWordList = stopWordFieldDefUtil.getFieldDefinitionWithStopwords(fieldDef);
assertEquals(stopWordList.size(), 1);

} catch (Exception e) {
e.printStackTrace();

}

}

@Test
public void testGetFieldDefinitionNamesWithStopwords() throws ZinggClientException{
FieldDefinition def1 = new FieldDefinition();
def1.setFieldName("field1");
def1.setDataType("string");
def1.setMatchTypeInternal(MatchType.FUZZY);
def1.setFields("field1");

FieldDefinition def2 = new FieldDefinition();
def2.setFieldName("field2");
def2.setDataType("string");
def2.setMatchTypeInternal(MatchType.EXACT);
def2.setStopWords("stopWordsFileName2");
def2.setFields("field2");

FieldDefinition def3 = new FieldDefinition();
def3.setFieldName("field3");
def3.setDataType("string");
def3.setMatchTypeInternal(MatchType.FUZZY);
def3.setStopWords("stopWordsFileName3");
def3.setFields("field3");

List<FieldDefinition> fieldDef = new ArrayList<FieldDefinition>();
fieldDef.add(def1);
fieldDef.add(def2);
fieldDef.add(def3);
IArguments args = null;
try {
args = new Arguments();
args.setFieldDefinition(fieldDef);
} catch (Exception e) {
e.printStackTrace();
}

String result = stopWordFieldDefUtil.getFieldDefinitionNamesWithStopwords(args);
assertEquals("field2, field3", result);

}

}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
import zingg.common.client.ZinggClientException;
import zingg.common.client.util.ColName;
import zingg.common.client.util.ColValues;
import zingg.common.client.util.StopWordFieldDefUtility;
import zingg.common.core.context.IContext;
import zingg.common.core.util.Analytics;
import zingg.common.core.util.Metric;
Expand All @@ -22,6 +23,7 @@ public abstract class ZinggBase<S,D, R, C, T> extends ZinggBaseCommon<S, D, R, C
protected String name;
protected long startTime;
protected ClientOptions clientOptions;
protected StopWordFieldDefUtility stopWordFieldDefUtility = new StopWordFieldDefUtility();
sania-16 marked this conversation as resolved.
Show resolved Hide resolved

public static final Log LOG = LogFactory.getLog(ZinggBase.class);

Expand Down Expand Up @@ -71,23 +73,17 @@ public void setSession(S s) {
}




public void track( boolean collectMetrics){
public void track(boolean collectMetrics){
Analytics.track(Metric.TOTAL_FIELDS_COUNT, args.getFieldDefinition().size(), collectMetrics);
Analytics.track(Metric.MATCH_FIELDS_COUNT, getDSUtil().getFieldDefinitionFiltered(args, MatchType.DONT_USE).size(),
collectMetrics);
Analytics.track(Metric.MATCH_FIELDS_COUNT, getDSUtil().getFieldDefinitionFiltered(args, MatchType.DONT_USE).size(), collectMetrics);
Analytics.track(Metric.DATA_FORMAT, getPipeUtil().getPipesAsString(args.getData()), collectMetrics);
Analytics.track(Metric.OUTPUT_FORMAT, getPipeUtil().getPipesAsString(args.getOutput()), collectMetrics);
Analytics.track(Metric.MODEL_ID, args.getModelId(), collectMetrics);

Analytics.track(Metric.STOPWORDS, stopWordFieldDefUtility.getFieldDefinitionNamesWithStopwords(args), collectMetrics);

}





public IContext<S,D,R,C,T> getContext() {
return this.context;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ public class Metric {
public static final String TRAINING_MATCHES = "trainingDataMatches";
public static final String TRAINING_NONMATCHES = "trainingDataNonmatches";
public static final String DATA_COUNT = "dataCount";
public static final String STOPWORDS = "stopWords";

public static final long timeout = 1200L;
public static final double confidence = 0.95; // default value
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,4 +148,5 @@ public void testGetFieldDefColumnsWhenShowConciseIsFalse() throws ZinggClientExc
assertTrue(expectedColumnsTest2.get(i).contains(colListTest2.get(i).toString()));
};
}

}
Loading