-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnGramScorer.cpp
49 lines (47 loc) · 1.97 KB
/
nGramScorer.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
#include <cmath>
#include <sstream>
#include <unordered_map>
#include "nGramScorer.h"
/**
 * Builds the scorer from an n-gram frequency table.
 *
 * Each line of the stream is expected to hold an n-gram followed by its
 * whitespace-separated integer occurrence count, e.g. "TION 13168375".
 * After loading, every stored count is replaced by its log10 relative
 * frequency: log10(count) - log10(sum of all counts).
 *
 * Lines that do not parse as "<ngram> <count>" (blank lines, missing or
 * non-numeric counts) and lines with a non-positive count are skipped, so
 * they can neither inject an indeterminate value into the totals nor store
 * log10(0) = -inf / log10(negative) = NaN in the table.
 *
 * If the stream is not open, nothing is loaded and the table stays empty.
 *
 * @param file Input stream positioned at the start of the frequency table.
 *             It is taken by value, so it is closed automatically when the
 *             parameter is destroyed.
 */
nGramScorer::nGramScorer(std::ifstream file) {
    if (file.is_open()) {
        // Pre-allocate buckets for the n-gram frequencies map
        // (presumably sized for the expected table -- confirm against the data file).
        nGramFrequencies.reserve(389373);

        std::string line;
        while (std::getline(file, line)) {
            // Extract the n-gram and its total occurrence from the line.
            std::stringstream lineStream(line);
            std::string ngram;
            int nGramTotalOccurrence = 0;
            const bool parsed = static_cast<bool>(lineStream >> ngram >> nGramTotalOccurrence);
            if (!parsed || nGramTotalOccurrence <= 0) {
                continue;  // Malformed or unusable entry: ignore it.
            }

            // The length of the most recently accepted n-gram defines the window size.
            nGramLength = static_cast<int>(ngram.length());
            sumTotalOccurrence += nGramTotalOccurrence;

            // Store the raw count; it is converted to a log frequency below.
            nGramFrequencies[ngram] = nGramTotalOccurrence;
        }
    }

    // Convert raw counts to log10 relative frequencies, writing through the
    // iterated element instead of hashing each key a second time.
    const double logSum = std::log10(sumTotalOccurrence);
    for (auto &entry : nGramFrequencies) {
        entry.second = std::log10(entry.second) - logSum;
    }
}
/**
 * Scores a text by summing the stored log10 frequency of every n-gram
 * window it contains.
 *
 * Every window of nGramLength consecutive characters is looked up in
 * nGramFrequencies. Windows that are present contribute their stored value;
 * windows that are absent contribute a fixed penalty of
 * log10(0.01) - log10(sumTotalOccurrence), i.e. the value an n-gram with a
 * count of 0.01 would have.
 *
 * @param text Text to score. Characters are used as-is (no case folding or
 *             filtering is performed here).
 * @return The accumulated score, or 0.0 when no window can be formed: the
 *         n-gram length is not positive (no table loaded) or the text is
 *         shorter than one n-gram.
 */
double nGramScorer::score(const std::string &text) {
    // Without a valid window size there is nothing meaningful to look up.
    if (nGramLength <= 0) {
        return 0.0;
    }
    const auto windowLength = static_cast<std::string::size_type>(nGramLength);

    // A text shorter than one n-gram has no windows. Checking this up front
    // also avoids computing a position before the start of the buffer.
    if (text.size() < windowLength) {
        return 0.0;
    }

    // Penalty applied for every n-gram that is missing from the table.
    const double unseenPenalty = std::log10(0.01) - std::log10(sumTotalOccurrence);

    double total = 0;
    std::string window;  // Reused for every lookup instead of a fresh temporary.
    const std::string::size_type lastStart = text.size() - windowLength;
    for (std::string::size_type start = 0; start <= lastStart; ++start) {
        window.assign(text, start, windowLength);
        const auto it = nGramFrequencies.find(window);
        if (it != nGramFrequencies.end()) {
            // Known n-gram: add its log frequency.
            total += it->second;
        } else {
            // Unknown n-gram: add the penalty.
            total += unseenPenalty;
        }
    }
    return total;
}