presage  0.9.2~beta
text2ngram.cpp
Go to the documentation of this file.
1 
2 /******************************************************
3  * Presage, an extensible predictive text entry system
4  * ---------------------------------------------------
5  *
6  * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License along
19  with this program; if not, write to the Free Software Foundation, Inc.,
20  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  **********(*)*/
23 
24 
25 #include "config.h"
26 
27 #include <iostream>
28 #include <fstream>
29 #include <vector>
30 #include <list>
31 #include <string>
32 #include <map>
33 
34 #ifdef HAVE_UNISTD_H
35 # include <unistd.h>
36 #endif
37 
38 #ifdef HAVE_STDLIB_H
39 # include <stdlib.h>
40 #endif
41 
42 #include <getopt.h>
43 #include <assert.h>
44 
46 #include "core/iso8859_1.h"
47 #include "core/progress.h"
48 
49 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h"
50 
// Name this tool prints in its usage and version messages.
const std::string PROGRAM_NAME("text2ngram");

// Ordered sequence of tokens forming one n-gram; used as the key of the
// n-gram count map built in main().
typedef std::list<std::string> NgramList;

// Helpers defined after main().
void version();
void usage();
57 
58 int main(int argc, char* argv[])
59 {
60  int next_option;
61 
62  // Setup some defaults
63  // - default to generating 1-gram counts
64  int ngrams = 1;
65 
66  // - default output to stdout (empty string signifies stdout)
67  std::string output;
68 
69  const std::string TABBED_SEPARATED_VALUES = "tsv";
70  const std::string SQLITE = "sqlite";
71  // - default format is tabbed separated values
72  std::string format = TABBED_SEPARATED_VALUES;
73 
74  // - default to case sensitive
75  bool lowercase = false;
76 
77  // - default to no append
78  bool append = false;
79 
80 
81  // getopt structures
82  const char * const short_options = "n:o:f:alhv";
83  const struct option long_options[] =
84  {
85  { "ngrams", required_argument, 0, 'n' },
86  { "output", required_argument, 0, 'o' },
87  { "format", required_argument, 0, 'f' },
88  { "append", no_argument, 0, 'a' },
89  { "lowercase", no_argument, 0, 'l' },
90  { "help", no_argument, 0, 'h' },
91  { "version", no_argument, 0, 'v' },
92  { 0, 0, 0, 0 }
93  };
94 
95  do {
96  next_option = getopt_long(argc,
97  argv,
98  short_options,
99  long_options,
100  NULL);
101 
102  switch (next_option) {
103  case 'n': // --ngrams or -n option
104  if (atoi(optarg) > 0) {
105  ngrams = atoi(optarg);
106  } else {
107  usage();
108  }
109  break;
110  case 'o': // --output or -o option
111  output = optarg;
112  break;
113  case 'f': // --format or -f option
114  if (optarg == SQLITE
115  || optarg == TABBED_SEPARATED_VALUES) {
116  format = optarg;
117  } else {
118  std::cerr << "Unknown format " << optarg << std::endl << std::endl;
119  usage();
120  return -1;
121  }
122  break;
123  case 'a': // --append or -a option
124  // append mode
125  append = true;
126  break;
127  case 'l': // --lowercase or -l option
128  lowercase = true;
129  break;
130  case 'h': // --help or -h option
131  usage();
132  exit (0);
133  break;
134  case 'v': // --version or -v option
135  version();
136  exit (0);
137  break;
138  case '?': // unknown option
139  usage();
140  exit (0);
141  break;
142  case -1:
143  break;
144  default:
145  std::cerr << "Error: unhandled option." << std::endl;
146  exit(0);
147  }
148 
149  } while (next_option != -1);
150 
151 
152  if ((argc - optind < 1)) {
153  usage();
154  return -1;
155  }
156 
157 
158  // ngramMap stores <token,count> pairs
159  std::map<NgramList, int> ngramMap;
160 
161  for (int i = optind; i < argc; i++) {
162  // do the actual processing file by file
163  std::string token;
164  NgramList ngram;
165 
166  // points to output file
167  // print out file information
168  std::cout << "Parsing " << argv[i] << "..."
169  << std::endl;
170 
171  ProgressBar<char> progressBar;
172 
173  // create tokenizer object and open input file stream
174  std::ifstream infile(argv[i]);
175  ForwardTokenizer tokenizer(infile,
176  " \f\n\r\t\v",
177  "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
178  tokenizer.lowercaseMode(lowercase);
179 
180  // take care of first N-1 tokens
181  for (int i = 0; (i < ngrams - 1 && tokenizer.hasMoreTokens()); i++) {
182  ngram.push_back(tokenizer.nextToken());
183  }
184 
185  while (tokenizer.hasMoreTokens()) {
186  // extract token from input stream
187  token = tokenizer.nextToken();
188 
189  // update ngram with new token
190  ngram.push_back(token);
191 
192  // update map with new token occurrence
193  ngramMap[ngram] = ngramMap[ngram] + 1;
194 
195  // update progress bar
196  //progressBar(tokenizer.progress());
197  progressBar.update(tokenizer.progress());
198 
199  // remove front token from ngram
200  ngram.pop_front();
201  }
202 
203  infile.close();
204  }
205 
206 
207  std::cout << "Writing out to " << format << " format file "
208  << output << "..." << std::endl;
209  if (format == TABBED_SEPARATED_VALUES) {
210  // output to tabbed separated values text file
211  //
212 
213  std::ofstream *outstream = 0;
214  std::ostream *prev_outstream = 0;
215 
216  if (output.c_str()) {
217  // tie outstream to file
218  outstream = new std::ofstream (output.c_str(), std::ios::out);
219  assert(outstream);
220  prev_outstream = std::cout.tie (outstream);
221  }
222 
223  // write results to output stream
224  ProgressBar<char> progressBar;
225  long total = ngramMap.size();
226  long count = 0;
227  std::map<NgramList, int>::const_iterator it;
228  for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229  for (NgramList::const_iterator ngram_it = it->first.begin();
230  ngram_it != it->first.end();
231  ngram_it++) {
232  std::cout << *ngram_it << '\t';
233  }
234  std::cout << it->second << std::endl;
235  progressBar.update(static_cast<double>(count++)/total);
236  }
237 
238  if (output.c_str()) {
239  std::cout.tie (prev_outstream);
240  outstream->close ();
241  delete outstream;
242  }
243 
244  } else if (format == SQLITE) {
245  // output to SQLITE
246  //
247 
248  SqliteDatabaseConnector sqliteDbCntr(output, ngrams, true);
249  sqliteDbCntr.beginTransaction();
250  sqliteDbCntr.createNgramTable(ngrams);
251 
252  // write results to output stream
253  ProgressBar<char> progressBar;
254  long total = ngramMap.size();
255  long count = 0;
256  std::map<NgramList, int>::const_iterator it;
257  for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
258 
259  // convert from NgramList to Ngram
260  Ngram ngram;
261  for (NgramList::const_iterator jt = it->first.begin();
262  jt != it->first.end();
263  jt++) {
264  ngram.push_back(*jt);
265  }
266 
267  if (append) {
268  // need to check whether ngram is already in database.
269  // when appending to existing database
270  int count = sqliteDbCntr.getNgramCount(ngram);
271  if (count > 0) {
272  // ngram already in database, update count
273  sqliteDbCntr.updateNgram(ngram, count + it->second);
274  } else {
275  // ngram not in database, insert it
276  sqliteDbCntr.insertNgram(ngram, it->second);
277  }
278  } else {
279  // insert ngram
280  sqliteDbCntr.insertNgram(ngram, it->second);
281  }
282 
283  progressBar.update(static_cast<double>(count++)/total);
284  }
285  sqliteDbCntr.endTransaction();
286  } else {
287  abort();
288  }
289 
290 
291  std::cout << std::endl;
292 
293  return 0;
294 }
295 
296 
297 void version()
298 {
299  std::cout
300  << PROGRAM_NAME << " (" << PACKAGE << ") version " << VERSION << std::endl
301  << "Copyright (C) Matteo Vescovi" << std::endl
302  << "This is free software; see the source for copying conditions. There is NO" << std::endl
303  << "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." << std::endl
304  << std::endl;
305 }
306 
307 
308 void usage()
309 {
310  std::cout
311  << "Usage: " << PROGRAM_NAME << " [OPTION]... infiles..." << std::endl
312  << std::endl
313  << " --output, -o O " << "Output file name O" << std::endl
314  << " --ngrams, -n N " << "Specify ngram cardinality N" << std::endl
315  << " --format, -f F " << "Output file format F: sqlite, tsv (tabbed separated values)" << std::endl
316  << " --lowercase, -l " << "Enable lowercase conversion mode" << std::endl
317  << " --append, -a " << "Open output file in append mode" << std::endl
318  << " --help, -h " << "Display this information" << std::endl
319  << " --version, -v " << "Show version information" << std::endl
320  << std::endl
321  << PROGRAM_NAME << " is free software distributed under the GPL." << std::endl
322  << "Send bug reports to " << PACKAGE_BUGREPORT << std::endl
323  << "Copyright (C) Matteo Vescovi" << std::endl;
324 }
int getNgramCount(const Ngram ngram) const
void insertNgram(const Ngram ngram, const int count) const
void update(const double percentage)
Definition: progress.h:54
virtual double progress() const
virtual bool hasMoreTokens() const
const std::string PROGRAM_NAME
Definition: text2ngram.cpp:51
std::list< std::string > NgramList
Definition: text2ngram.cpp:53
void lowercaseMode(const bool)
Definition: tokenizer.cpp:81
virtual std::string nextToken()
int main(int argc, char *argv[])
Definition: text2ngram.cpp:58
virtual void beginTransaction() const
void updateNgram(const Ngram ngram, const int count) const
void createNgramTable(const size_t cardinality) const
virtual void endTransaction() const
void usage()
Definition: text2ngram.cpp:308
void version()
Definition: text2ngram.cpp:297
Definition: ngram.h:33
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)
Definition: logger.h:278