35 "SmoothedNgramPredictor, a linear interpolating n-gram predictor",
36 "SmoothedNgramPredictor, long description." ),
40 learn_mode_set (false),
84 std::stringstream ss_deltas(value);
87 while (ss_deltas >> delta) {
88 logger << DEBUG <<
"Pushing delta: " << delta <<
endl;
152 const char separator[] =
"|";
153 std::string result = separator;
155 for (Ngram::const_iterator it = ngram.begin();
159 result += *it + separator;
183 unsigned int result = 0;
186 assert(ngram_size >= 0);
188 if (ngram_size > 0) {
189 Ngram ngram(ngram_size);
190 copy(tokens.end() - ngram_size + offset , tokens.end() + offset, ngram.begin());
195 logger << DEBUG <<
"unigram counts sum: " << result <<
endl;
230 std::vector<std::string> prefixCompletionCandidates;
231 for (
size_t k =
cardinality; (k > 0 && prefixCompletionCandidates.size() < max_partial_prediction_size); k--) {
232 logger << DEBUG <<
"Building partial prefix completion table of cardinality: " << k <<
endl;
234 Ngram prefix_ngram(k);
235 copy(tokens.end() - k, tokens.end(), prefix_ngram.begin());
238 logger << DEBUG <<
"prefix_ngram: ";
239 for (
size_t r = 0; r < prefix_ngram.size(); r++) {
240 logger << DEBUG << prefix_ngram[r] <<
' ';
253 max_partial_prediction_size - prefixCompletionCandidates.size());
258 logger << DEBUG <<
"partial prefixCompletionCandidates" <<
endl 259 << DEBUG <<
"----------------------------------" <<
endl;
260 for (
size_t j = 0; j < partial.size(); j++) {
261 for (
size_t k = 0; k < partial[j].size(); k++) {
262 logger << DEBUG << partial[j][k] <<
" ";
268 logger << DEBUG <<
"Partial prefix completion table contains " << partial.size() <<
" potential completions." <<
endl;
274 std::vector<Ngram>::const_iterator it = partial.begin();
275 while (it != partial.end() && prefixCompletionCandidates.size() < max_partial_prediction_size) {
279 std::string candidate = *(it->end() - 2);
280 if (find(prefixCompletionCandidates.begin(),
281 prefixCompletionCandidates.end(),
282 candidate) == prefixCompletionCandidates.end()) {
283 prefixCompletionCandidates.push_back(candidate);
290 logger << DEBUG <<
"prefixCompletionCandidates" <<
endl 291 << DEBUG <<
"--------------------------" <<
endl;
292 for (
size_t j = 0; j < prefixCompletionCandidates.size(); j++) {
293 logger << DEBUG << prefixCompletionCandidates[j] <<
endl;
303 for (
size_t j = 0; (j < prefixCompletionCandidates.size() && j < max_partial_prediction_size); j++) {
305 tokens[
cardinality - 1] = prefixCompletionCandidates[j];
307 logger << DEBUG <<
"------------------" <<
endl;
310 double probability = 0;
312 double numerator =
count(tokens, 0, k+1);
314 double denominator = (k == 0 ? unigrams_counts_sum :
count(tokens, -1, k));
315 double frequency = ((denominator > 0) ? (numerator / denominator) : 0);
316 probability +=
deltas[k] * frequency;
318 logger << DEBUG <<
"numerator: " << numerator <<
endl;
319 logger << DEBUG <<
"denominator: " << denominator <<
endl;
320 logger << DEBUG <<
"frequency: " << frequency <<
endl;
324 assert(numerator <= denominator);
325 assert(frequency <= 1);
329 logger << DEBUG <<
"probability: " << probability <<
endl;
331 if (probability > 0) {
351 std::map<std::list<std::string>,
int> ngramMap;
355 for (
size_t curr_cardinality = 1;
360 int change_size = change.size();
362 std::list<std::string> ngram_list;
366 (i < curr_cardinality - 1 && change_idx < change_size);
369 ngram_list.push_back(change[change_idx]);
373 while (change_idx < change_size)
375 ngram_list.push_back(change[change_idx++]);
376 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
377 ngram_list.pop_front();
400 if (change.size() > 0 &&
405 std::list<std::string> ngram_list(change.begin(), change.begin() + 1);
421 logger << DEBUG <<
"Adding extra token: " << extra_token <<
endl;
423 if (extra_token.empty())
427 ngram_list.push_front(extra_token);
429 ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
438 std::map<std::list<std::string>,
int>::const_iterator it;
439 for (it = ngramMap.begin(); it != ngramMap.end(); it++)
442 Ngram ngram((it->first).begin(), (it->first).end());
460 logger << INFO <<
"Committed learning update to database" <<
endl;
465 logger << ERROR <<
"Rolling back learning update : " << ex.
what() <<
endl;
481 size_t size = ngram.size();
482 for (
size_t i = 0; i < size; i++) {
483 if (
count(ngram, -i, size - i) >
count(ngram, -(i + 1), size - (i + 1))) {
484 logger << INFO <<
"consistency adjustment needed!" <<
endl;
486 int offset = -(i + 1);
487 int sub_ngram_size = size - (i + 1);
489 logger << DEBUG <<
"i: " << i <<
" | offset: " << offset <<
" | sub_ngram_size: " << sub_ngram_size <<
endl;
491 Ngram sub_ngram(sub_ngram_size);
492 copy(ngram.end() - sub_ngram_size + offset, ngram.end() + offset, sub_ngram.begin());
495 logger <<
"ngram to be count adjusted is: ";
496 for (
size_t i = 0; i < sub_ngram.size(); i++) {
497 logger << sub_ngram[i] <<
' ';
503 logger << DEBUG <<
"consistency adjusted" <<
endl;
static int toInt(const std::string)
void dispatch(const Observable *var)
~SmoothedNgramPredictor()
Dispatcher< SmoothedNgramPredictor > dispatcher
int getNgramCount(const Ngram ngram) const
void insertNgram(const Ngram ngram, const int count) const
virtual const char * what() const
void set_count_threshold(const std::string &value)
virtual void learn(const std::vector< std::string > &change)
int getUnigramCountsSum() const
void set_database_logger_level(const std::string &level)
virtual void set_logger(const std::string &level)
NgramTable getNgramLikeTable(const Ngram ngram, const char **filter, const int count_threshold, int limit=-1) const
void set_deltas(const std::string &deltas)
const std::string PREDICTORS
static std::string ngram_to_string(const Ngram &ngram)
std::vector< double > deltas
void init_database_connector_if_ready()
static double toDouble(const std::string)
unsigned int count(const std::vector< std::string > &tokens, int offset, int ngram_size) const
Builds the required n-gram and returns its count.
virtual void update(const Observable *variable)
std::vector< Ngram > NgramTable
void set_learn(const std::string &learn_mode)
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
void check_learn_consistency(const Ngram &name) const
virtual std::string get_name() const =0
SmoothedNgramPredictor(Configuration *, ContextTracker *, const char *)
int incrementNgramCount(const Ngram ngram) const
virtual void beginTransaction() const
void updateNgram(const Ngram ngram, const int count) const
void map(Observable *var, const mbr_func_ptr_t &ptr)
virtual void rollbackTransaction() const
ContextTracker * contextTracker
std::string COUNT_THRESHOLD
virtual void endTransaction() const
void addSuggestion(Suggestion)
static bool isYes(const char *)
std::string DATABASE_LOGGER
virtual Prediction predict(const size_t size, const char **filter) const
Generate prediction.
Tracks user interaction and context.
virtual std::string get_value() const =0
void set_dbfilename(const std::string &filename)
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
std::string getToken(const int) const