Skip to content

Commit

Permalink
Merge pull request #2076 from jplag/feature/GSTOutOfBoundFix
Browse files Browse the repository at this point in the history
Implemented GreedyStringTiling workaround
  • Loading branch information
tsaglam authored Jan 16, 2025
2 parents 7d7fca2 + 4ba27ae commit a10a2d6
Showing 1 changed file with 24 additions and 2 deletions.
26 changes: 24 additions & 2 deletions core/src/main/java/de/jplag/GreedyStringTiling.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import java.util.stream.Collectors;

import de.jplag.options.JPlagOptions;

Expand All @@ -30,6 +31,15 @@ public class GreedyStringTiling {
private final Map<Submission, int[]> cachedTokenValueLists = new IdentityHashMap<>();
private final Map<Submission, SubsequenceHashLookupTable> cachedHashLookupTables = new IdentityHashMap<>();

/**
 * Message template for the {@link IllegalStateException} raised by {@code checkMark} when a mark index lies beyond the
 * end of the mark array. It is filled via {@link String#format(String, Object...)} with, in this order: the length of
 * the mark array, the offending index, the token count of the submission, the comma-separated token type descriptions,
 * the length of the cached token value list, the name of the submission whose marks were accessed, and the name of the
 * other submission of the comparison.
 */
private static final String ERROR_INDEX_OUT_OF_BOUNDS = """
GST index out of bounds. This is probably a random issue caused by multithreading issues.
Length of the list that caused the exception (the list of marks for the relevant submission): %s, Index in that list: %s
TokenCount: %s, TokenList: %s
CachedTokenCount: %s
Submission (cause of error): %s
Submission (other): %s
""".trim().stripIndent();

public GreedyStringTiling(JPlagOptions options) {
this.options = options;
// Ensures 1 <= neighborLength <= minimumTokenMatch
Expand Down Expand Up @@ -115,14 +125,16 @@ private JPlagComparison compareInternal(Submission leftSubmission, Submission ri
List<Match> iterationMatches = new ArrayList<>();
for (int leftStartIndex = 0; leftStartIndex < leftValues.length - maximumMatchLength; leftStartIndex++) {
int leftSubsequenceHash = leftLookupTable.subsequenceHashForStartIndex(leftStartIndex);
if (leftMarked[leftStartIndex] || leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) {
if (checkMark(leftMarked, leftStartIndex, leftSubmission, rightSubmission)
|| leftSubsequenceHash == SubsequenceHashLookupTable.NO_HASH) {
continue;
}
List<Integer> possiblyMatchingRightStartIndexes = rightLookupTable
.startIndexesOfPossiblyMatchingSubsequencesForSubsequenceHash(leftSubsequenceHash);
for (Integer rightStartIndex : possiblyMatchingRightStartIndexes) {
// comparison uses >= because it is assumed that the last token is a pivot (FILE_END)
if (rightMarked[rightStartIndex] || maximumMatchLength >= rightValues.length - rightStartIndex) {
if (checkMark(rightMarked, rightStartIndex, rightSubmission, leftSubmission)
|| maximumMatchLength >= rightValues.length - rightStartIndex) {
continue;
}

Expand Down Expand Up @@ -228,4 +240,14 @@ private int[] tokenValueListFromSubmission(Submission submission) {
return tokenValueList;
}));
}

/**
 * Reads the mark at the given index and fails with a detailed diagnostic instead of a bare
 * {@link ArrayIndexOutOfBoundsException} when the index does not address an element of the mark array.
 * @param marks is the mark array to read from.
 * @param index is the position in the mark array to read.
 * @param submission is the submission reported as the cause of the error if the index is invalid.
 * @param otherSubmission is the second submission of the comparison, reported for context only.
 * @return the mark stored at the given index.
 * @throws IllegalStateException if the index is negative or not smaller than the length of the mark array.
 */
private boolean checkMark(boolean[] marks, int index, Submission submission, Submission otherSubmission) {
    if (index >= 0 && index < marks.length) {
        return marks[index];
    }

    // Map.get returns null when no entry is present; guard the lookup so that a NullPointerException cannot
    // replace the diagnostic that is about to be thrown.
    int[] cachedTokenValues = cachedTokenValueLists.get(submission);
    Object cachedTokenCount = cachedTokenValues == null ? "not cached" : Integer.valueOf(cachedTokenValues.length);
    String tokenDescriptions = submission.getTokenList().stream().map(it -> it.getType().getDescription())
            .collect(Collectors.joining(", "));

    throw new IllegalStateException(String.format(ERROR_INDEX_OUT_OF_BOUNDS, marks.length, index, submission.getTokenList().size(),
            tokenDescriptions, cachedTokenCount, submission.getName(), otherSubmission.getName()));
}
}

0 comments on commit a10a2d6

Please sign in to comment.