presage  0.9.2~beta
contextChangeDetector.cpp
Go to the documentation of this file.
1 
2 /******************************************************
3  * Presage, an extensible predictive text entry system
4  * ---------------------------------------------------
5  *
6  * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License along
19  with this program; if not, write to the Free Software Foundation, Inc.,
20  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  **********(*)*/
23 
24 
25 #include "contextChangeDetector.h"
26 #include "../tokenizer/reverseTokenizer.h"
27 
28 #include <iostream>
29 #include <sstream>
30 #include <stdlib.h> // for atoi()
31 #include <assert.h>
32 
33 const std::string::size_type ContextChangeDetector::DEFAULT_SLIDING_WINDOW_SIZE = 80;
34 
36  const std::string tChars,
37  const std::string bChars,
38  const std::string cChars,
39  bool lowercase)
40  : wordChars (wChars),
41  separatorChars (tChars),
42  blankspaceChars(bChars),
43  controlChars (cChars),
44  lowercase_mode (lowercase)
45 {
46  // intentionally empty
47 }
48 
50 {
51  // intentionally empty
52 }
53 
55 {
56  if(!str.empty()) {
57  SLIDING_WINDOW_SIZE = atoi(str.c_str());
58  } else {
60  }
61 }
62 
64 {
65  if (str.size() <= SLIDING_WINDOW_SIZE) {
66  // past stream fits in sliding window
67  sliding_window = str;
68  } else {
69  // trim past stream down to sliding window
70  sliding_window = str.substr(str.size() - SLIDING_WINDOW_SIZE);
71  assert(sliding_window.size() == SLIDING_WINDOW_SIZE);
72  }
73 }
74 
75 bool ContextChangeDetector::context_change(const std::string& past_stream) const
76 {
77  // Here's how this is going to be implemented... We'll keep a
78  // sliding window on the last few chars seen by presage; the
79  // buffer holding them is the sliding window. We'll search for the
80  // last occurence of sliding_window in past_stream, if any.
81 
82  // If sliding_window is not found in past_stream, then it is not
83  // possible to relate the current context to the previously seen
84  // context, hence we assume a context change has occured.
85 
86  // If sliding_window is found, then we need to examine the chars
87  // following the sliding window in the past stream. We call this
88  // the remainder. If there are any non-word chars in the
89  // remainder, then a context change has occurred. Else, no context
90  // change occured.
91 
92  // The sliding window is never implicitly updated as part of
93  // invoking this method.
94 
95  return context_change_helper(sliding_window, past_stream);
96 }
97 
98 
99 bool ContextChangeDetector::context_change_helper(const std::string& prev_context, const std::string& curr_context) const
100 {
101  bool result = false;
102 
103  if (prev_context.empty()) {
104  if (curr_context.empty()) {
105  // both contexts are empty, nothing has happened, no
106  // change happened
107  result = false;
108  } else {
109  // current context changed, previous context is empty,
110  // first change happened
111  result = true;
112  }
113  } else {
114  // find position of previous context in current context
115  // i.e. find index pointing to last char of last occurence of
116  // prev_context in curr_context
117  std::string::size_type ctx_idx = curr_context.rfind(prev_context);
118 
119  if (ctx_idx == std::string::npos) {
120  // prev_context could not be found in curr_context, a lot
121  // changed
122  result = true;
123  } else {
124  // found prev_context, examine remainder string.
125  // remainder string is substr(ctx_idx +
126  // prev_context.size()); i.e. substring given by index
127  // returned by rfind (which points at beginning of
128  // prev_context string found in curr_context) plus size of
129  // prev_context: this index points at end of prev_context
130  // substring found in curr_context
131 
132  std::string remainder = curr_context.substr(ctx_idx + prev_context.size());
133 
134  std::string::size_type idx = remainder.find_last_of(wordChars);
135  if (idx == std::string::npos) {
136  if (remainder.empty()) {
137  result = false;
138  } else {
139  char last_char = curr_context[ctx_idx + prev_context.size() - 1];
140  idx = wordChars.find(last_char);
141  if (idx == std::string::npos) {
142  result = false;
143  } else {
144  result = true;
145  }
146  }
147  } else {
148  if (idx == remainder.size() - 1) {
149  result = false;
150  } else {
151  result = true;
152  }
153  }
154 
155 
156 /*
157  * alternate implementation of the logic in the enclosing else
158  * block. This uses tokenizers, which is not desirable as it makes
159  * tokenizer a dependency of context change detector.
160 
161  std::string remainder = curr_context.substr(loc + prev_context.size());
162 
163  std::stringstream curr_strstream(curr_context);
164  std::stringstream prev_strstream(prev_context);
165 
166  ReverseTokenizer curr_tokenizer(curr_strstream, blankspaceChars, separatorChars);
167  ReverseTokenizer prev_tokenizer(prev_strstream, blankspaceChars, separatorChars);
168 
169  std::string prev_token = prev_tokenizer.nextToken();
170  std::string curr_token = curr_tokenizer.nextToken();
171 
172  if (curr_token.empty()) {
173  if (prev_token.empty()) {
174  result = false;
175 
176  loc = curr_context.find_first_of(wordChars, loc);
177  if (loc == std::string::npos) {
178  result = false;
179  } else {
180  result = true;
181  }
182 
183  } else {
184  result = true;
185  }
186 
187  } else {
188  loc = curr_token.find(prev_token);
189  if (loc == std::string::npos) {
190  result = true;
191  } else {
192  result = false;
193  }
194  }
195 */
196 
197  }
198  }
199 
200  return result;
201 }
202 
203 std::string ContextChangeDetector::change(const std::string& past_stream) const
204 {
205  const std::string& prev_context = sliding_window; // let's rename these
206  const std::string& curr_context = past_stream; // for clarity's sake
207 
208  std::string result;
209 
210  if (sliding_window.empty()) {
211  result = past_stream;
212  } else {
213  // find position of previous context in current context
214  // i.e. find index pointing to last char of last occurence of
215  // prev_context in curr_context
216  std::string::size_type ctx_idx = curr_context.rfind(prev_context);
217 
218  if (ctx_idx == std::string::npos) {
219  // prev_context could not be found in curr_context, a lot
220  // changed
221  result = past_stream;
222  } else {
223  // found prev_context, examine remainder string.
224  // remainder string is substr(ctx_idx +
225  // prev_context.size()); i.e. substring given by index
226  // returned by rfind (which points at beginning of
227  // prev_context string found in curr_context) plus size of
228  // prev_context: this index points at end of prev_context
229  // substring found in curr_context
230 
231  result = curr_context.substr(ctx_idx + prev_context.size());
232 
233  // handle case where a context change has occured and
234  // remainder string only contains part of the last token,
235  // i.e.:
236  //
237  // sliding_window = "The quick bro";
238  // past_stream = "The quick brown ";
239  //
240  // In this case, the remainder will only contain "wn", and
241  // the last token in the sliding window must be prepended
242  // to the change to be learnt
243  //
244  if (context_change(past_stream)) {
245  // prepend partially entered token to change if it
246  // exists, need to look into sliding_window to get
247  // previously partially entered token if it exists
248  std::stringstream sliding_window_stream;
249  sliding_window_stream << get_sliding_window();
250  ReverseTokenizer rTok(sliding_window_stream,
254  std::string first_token = rTok.nextToken();
255  if (!first_token.empty()) {
256  result = first_token + result;
257  }
258  }
259  }
260  }
261 
262  return result;
263 }
264 
266 {
267  return sliding_window;
268 }
std::string change(const std::string &past_stream) const
const std::string wordChars
const std::string blankspaceChars
bool context_change(const std::string &past_stream) const
std::string get_sliding_window() const
virtual std::string nextToken()
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
std::string::size_type SLIDING_WINDOW_SIZE
void set_sliding_window_size(const std::string &str)
const std::string separatorChars
ContextChangeDetector(const std::string, const std::string, const std::string, const std::string, bool)
static const std::string::size_type DEFAULT_SLIDING_WINDOW_SIZE
void update_sliding_window(const std::string &str)
bool context_change_helper(const std::string &str1, const std::string &str2) const