presage  0.9.2~beta
contextTracker.cpp
Go to the documentation of this file.
1 
2 /******************************************************
3  * Presage, an extensible predictive text entry system
4  * ---------------------------------------------------
5  *
6  * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License along
19  with this program; if not, write to the Free Software Foundation, Inc.,
20  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  **********(*)*/
23 
24 
25 #include "contextTracker.h"
26 #include "../utility.h"
27 #include "../predictorRegistry.h"
28 #include "../tokenizer/forwardTokenizer.h"
29 
30 #include <stdlib.h> // for atoi()
31 
32 const char* ContextTracker::LOGGER = "Presage.ContextTracker.LOGGER";
33 const char* ContextTracker::SLIDING_WINDOW_SIZE = "Presage.ContextTracker.SLIDING_WINDOW_SIZE";
34 const char* ContextTracker::LOWERCASE_MODE = "Presage.ContextTracker.LOWERCASE_MODE";
35 const char* ContextTracker::ONLINE_LEARNING = "Presage.ContextTracker.ONLINE_LEARNING";
36 
38  PredictorRegistry* registry,
39  PresageCallback* callback,
40  const char wChars[],
41  const char tChars[],
42  const char bChars[],
43  const char cChars[])
44  : wordChars (wChars),
45  separatorChars (tChars),
46  blankspaceChars(bChars),
47  controlChars (cChars),
48  predictorRegistry (registry),
49  logger ("ContextTracker", std::cerr),
50  //tokenizer (pastStream, blankspaceChars, separatorChars),
51  lowercase_mode (true),
52  dispatcher (this)
53 {
54  if (callback) {
56  } else {
57  throw new PresageException(PRESAGE_INVALID_CALLBACK_ERROR, "Invalid callback object");
58  }
59 
65 
66  // set pointer to this context tracker in predictor registry so that
67  // predictors can be constructed when next iterator is requested
68  //
69  if (predictorRegistry) {
71  }
72 
73  // build dispatch map
78 }
79 
81 {
82  delete contextChangeDetector;
83 }
84 
85 void ContextTracker::set_logger (const std::string& value)
86 {
87  logger << setlevel (value);
88  logger << INFO << "LOGGER: " << value << endl;
89 }
90 
91 void ContextTracker::set_sliding_window_size (const std::string& value)
92 {
94  logger << INFO << "SLIDING_WINDOWS_SIZE: " << value << endl;
95 }
96 
97 void ContextTracker::set_lowercase_mode (const std::string& value)
98 {
100  logger << INFO << "LOWERCASE_MODE: " << value << endl;
101 }
102 
103 void ContextTracker::set_online_learning(const std::string& value)
104 {
106  logger << INFO << "ONLINE_LEARNING: " << value << endl;
107 }
108 
110 {
112  if (new_callback) {
113  context_tracker_callback = new_callback;
114  }
115  return result;
116 }
117 
122 {
124 }
125 
127 {
128  // detect change that needs to be learned
129  std::string change = contextChangeDetector->change(getPastStream());
130 
131  if (online_learning)
132  {
133  learn (change);
134  }
135 
136  // update sliding window
138 }
139 
140 void ContextTracker::learn(const std::string& text) const
141 {
142  logger << INFO << "learn(): text: " << text << endl;
143 
144  std::stringstream stream_to_learn(text);
145 
146  // split stream up into tokens
147  std::vector<std::string> tokens;
148  ForwardTokenizer tok(stream_to_learn,
152  logger << INFO << "learn(): tokenized change: ";
153  while (tok.hasMoreTokens()) {
154  std::string token = tok.nextToken();
155  tokens.push_back(token);
156  logger << INFO << token << '|';
157  }
158  logger << INFO << endl;
159 
160  if (! tokens.empty()) {
161  // remove prefix (partially entered token or empty token)
162  tokens.pop_back();
163  }
164 
165  if ((logger << INFO).shouldLog())
166  {
167  logger << "learn(): sanitized change: ";
168  for (std::vector<std::string>::const_iterator it = tokens.begin();
169  it != tokens.end();
170  it++) {
171  logger << *it << '|';
172  }
173  logger << endl;
174  }
175 
176  // time to learn
178  Predictor* predictor = 0;
179 
180  while (it.hasNext()) {
181  predictor = it.next();
182  predictor->learn(tokens);
183  }
184 }
185 
186 std::string ContextTracker::getPrefix() const
187 {
188  return getToken(0);
189 }
190 
191 std::string ContextTracker::getToken(const int index) const
192 {
193  std::stringstream pastStringStream(context_tracker_callback->get_past_stream());
194  ReverseTokenizer tokenizer(pastStringStream, blankspaceChars, separatorChars);
195  tokenizer.lowercaseMode(lowercase_mode);
196 
197  std::string token;
198  int i = 0;
199  while (tokenizer.hasMoreTokens() && i <= index) {
200  token = tokenizer.nextToken();
201  i++;
202  }
203  if (i <= index) {
204  // in case the index points too far back
205  token = "";
206  }
207  return token;
208 
210 // "a b c"
211 // 2 1 0
212 // 0 1 2
213 // 1 2 3
214 //
215 // ForwardTokenizer tokenizer(pastStream, blankspaceChars, separatorChars);
216 // tokenizer.lowercaseMode(lowercase_mode);
217 // std::string result;
218 // int tokens = tokenizer.countTokens();
219 // // why oh why is this clear() required to get it to work???
220 // pastStream.clear();
221 // int j = 0;
222 // while (tokenizer.hasMoreTokens() && j < tokens - index) {
223 // result = tokenizer.nextToken();
224 // j++;
225 //
226 // std::cerr << "ContextTracker::getToken() current token: " << result << std::endl;
227 // }
228 // return result;
229 }
230 
231 std::string ContextTracker::getExtraTokenToLearn(const int index, const std::vector<std::string>& change) const
232 {
233  //logger << DEBUG
234  // << "past_stream : " << getPastStream() << endl
235  // << "change : " << contextChangeDetector->change(getPastStream()) << endl
236  // << "sliding_window: " << contextChangeDetector->get_sliding_window() + "\n" << endl;
237 
238 
239  // Extra tokens to learn are to be found in (past_stream - change)
240  //
241  // The change tokens are tokens that have not been seen or learnt
242  // before.
243  //
244  // The extra tokens to learn are tokens that have been seen and
245  // learn before, but that we need to reuse to fill out the n-gram
246  // of required cardinality that we are about to learn.
247  //
248  // To find the extra tokens to learn, we use the size of tokenized
249  // change vector to offset the index and extract the extra tokens
250  // to learn from the past stream.
251  //
252  // For example:
253  // past_stream : "The quick brown fox jumped over the "
254  // change : |over|the|
255  // extra_tokens: |The|quick|brown|fox|jumped|
256  //
257  return getToken(index + change.size());
258 }
259 
261 {
263 }
264 
265 std::string ContextTracker::getPastStream() const
266 {
267  std::string result = context_tracker_callback->get_past_stream();
268  return result;
269 }
270 
271 bool ContextTracker::isCompletionValid(const std::string& completion) const
272 {
273  bool result = false;
274 
275  std::string prefix = getPrefix();
276  prefix = Utility::strtolower(prefix); // no need to be case sensitive
277  if (completion.find(prefix) == 0) {
278  result = true;
279  }
280 
281  return result;
282 }
283 
284 bool ContextTracker::isWordChar(const char c) const
285 {
286  if(wordChars.find(c, 0) != std::string::npos)
287  return true;
288  else
289  return false;
290 }
291 
292 bool ContextTracker::isSeparatorChar(const char c) const
293 {
294  if(separatorChars.find(c, 0) != std::string::npos)
295  return true;
296  else
297  return false;
298 }
299 
300 bool ContextTracker::isBlankspaceChar(const char c) const
301 {
302  if(blankspaceChars.find(c, 0) != std::string::npos)
303  return true;
304  else
305  return false;
306 }
307 
308 bool ContextTracker::isControlChar(const char c) const
309 {
310  if(controlChars.find(c, 0) != std::string::npos)
311  return true;
312  else
313  return false;
314 }
315 
316 std::string ContextTracker::getWordChars() const
317 {
318  return wordChars;
319 }
320 
322 {
323  return separatorChars;
324 }
325 
327 {
328  return blankspaceChars;
329 }
330 
332 {
333  return controlChars;
334 }
335 
336 std::string ContextTracker::toString() const
337 {
339 }
340 
341 void ContextTracker::update (const Observable* variable)
342 {
343  logger << DEBUG << "Notification received: "
344  << variable->get_name () << " - " << variable->get_value () << endl;
345 
346  dispatcher.dispatch (variable);
347 }
const PresageCallback * context_tracker_callback
std::string blankspaceChars
static const char * LOGGER
ContextChangeDetector * contextChangeDetector
std::string change(const std::string &past_stream) const
void learn(const std::string &text) const
Learn from text.
void set_online_learning(const std::string &value)
PredictorRegistry * predictorRegistry
bool isBlankspaceChar(const char) const
void dispatch(const Observable *var)
Definition: dispatcher.h:73
static const char * ONLINE_LEARNING
std::string toString() const
std::string getControlChars() const
std::string getBlankspaceChars() const
std::string getPastStream() const
void set_lowercase_mode(const std::string &value)
const PresageCallback * callback(const PresageCallback *callback)
std::string controlChars
bool isControlChar(const char) const
_SetLevel setlevel(std::string __l)
Manipulator for level.
Definition: logger.h:46
bool context_change(const std::string &past_stream) const
bool isCompletionValid(const std::string &) const
std::string getSeparatorChars() const
virtual bool hasMoreTokens() const
virtual void learn(const std::vector< std::string > &change)=0
void set_sliding_window_size(const std::string &value)
std::string config
Definition: presageDemo.cpp:70
std::string getFutureStream() const
std::string wordChars
virtual std::string get_past_stream() const =0
ContextTracker(Configuration *config, PredictorRegistry *predictorRegistry, PresageCallback *callback, const char[]=DEFAULT_WORD_CHARS, const char[]=DEFAULT_SEPARATOR_CHARS, const char[]=DEFAULT_BLANKSPACE_CHARS, const char[]=DEFAULT_CONTROL_CHARS)
void setContextTracker(ContextTracker *ct)
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
std::string getExtraTokenToLearn(const int index, const std::vector< std::string > &change) const
virtual std::string get_name() const =0
virtual std::string nextToken()
void map(Observable *var, const mbr_func_ptr_t &ptr)
Definition: dispatcher.h:62
static char * strtolower(char *)
Definition: utility.cpp:42
static const char * SLIDING_WINDOW_SIZE
static const char * LOWERCASE_MODE
Dispatcher< ContextTracker > dispatcher
static bool isYes(const char *)
Definition: utility.cpp:185
void set_sliding_window_size(const std::string &str)
bool isWordChar(const char) const
void set_logger(const std::string &value)
std::string separatorChars
virtual std::string get_value() const =0
std::string getPrefix() const
bool isSeparatorChar(const char) const
Logger< char > logger
std::string getWordChars() const
virtual std::string get_future_stream() const =0
void update_sliding_window(const std::string &str)
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
Definition: logger.h:278
std::string getToken(const int) const