49 #include "../lib/predictors/dbconnector/sqliteDatabaseConnector.h" 58 int main(
int argc,
char* argv[])
69 const std::string TABBED_SEPARATED_VALUES =
"tsv";
70 const std::string SQLITE =
"sqlite";
72 std::string format = TABBED_SEPARATED_VALUES;
75 bool lowercase =
false;
82 const char *
const short_options =
"n:o:f:alhv";
83 const struct option long_options[] =
85 {
"ngrams", required_argument, 0,
'n' },
86 {
"output", required_argument, 0,
'o' },
87 {
"format", required_argument, 0,
'f' },
88 {
"append", no_argument, 0,
'a' },
89 {
"lowercase", no_argument, 0,
'l' },
90 {
"help", no_argument, 0,
'h' },
91 {
"version", no_argument, 0,
'v' },
96 next_option = getopt_long(argc,
102 switch (next_option) {
104 if (atoi(optarg) > 0) {
105 ngrams = atoi(optarg);
115 || optarg == TABBED_SEPARATED_VALUES) {
145 std::cerr <<
"Error: unhandled option." <<
std::endl;
149 }
while (next_option != -1);
152 if ((argc - optind < 1)) {
159 std::map<NgramList, int> ngramMap;
161 for (
int i = optind; i < argc; i++) {
168 std::cout <<
"Parsing " << argv[i] <<
"..." 174 std::ifstream infile(argv[i]);
177 "`~!@#$%^&*()_-+=\\|]}[{'\";:/?.>,<");
181 for (
int i = 0; (i < ngrams - 1 && tokenizer.
hasMoreTokens()); i++) {
190 ngram.push_back(token);
193 ngramMap[ngram] = ngramMap[ngram] + 1;
207 std::cout <<
"Writing out to " << format <<
" format file " 209 if (format == TABBED_SEPARATED_VALUES) {
213 std::ofstream *outstream = 0;
214 std::ostream *prev_outstream = 0;
216 if (output.c_str()) {
218 outstream =
new std::ofstream (output.c_str(), std::ios::out);
220 prev_outstream = std::cout.tie (outstream);
225 long total = ngramMap.size();
227 std::map<NgramList, int>::const_iterator it;
228 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
229 for (NgramList::const_iterator ngram_it = it->first.begin();
230 ngram_it != it->first.end();
232 std::cout << *ngram_it <<
'\t';
235 progressBar.
update(static_cast<double>(count++)/total);
238 if (output.c_str()) {
239 std::cout.tie (prev_outstream);
244 }
else if (format == SQLITE) {
254 long total = ngramMap.size();
256 std::map<NgramList, int>::const_iterator it;
257 for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
261 for (NgramList::const_iterator jt = it->first.begin();
262 jt != it->first.end();
264 ngram.push_back(*jt);
273 sqliteDbCntr.
updateNgram(ngram, count + it->second);
283 progressBar.
update(static_cast<double>(count++)/total);
301 <<
"Copyright (C) Matteo Vescovi" <<
std::endl 302 <<
"This is free software; see the source for copying conditions. There is NO" <<
std::endl 303 <<
"warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." <<
std::endl 313 <<
" --output, -o O " <<
"Output file name O" <<
std::endl 314 <<
" --ngrams, -n N " <<
"Specify ngram cardinality N" <<
std::endl 315 <<
" --format, -f F " <<
"Output file format F: sqlite, tsv (tabbed separated values)" <<
std::endl 316 <<
" --lowercase, -l " <<
"Enable lowercase conversion mode" <<
std::endl 317 <<
" --append, -a " <<
"Open output file in append mode" <<
std::endl 318 <<
" --help, -h " <<
"Display this information" <<
std::endl 319 <<
" --version, -v " <<
"Show version information" <<
std::endl 322 <<
"Send bug reports to " << PACKAGE_BUGREPORT <<
std::endl 323 <<
"Copyright (C) Matteo Vescovi" <<
std::endl;
int getNgramCount(const Ngram ngram) const
void insertNgram(const Ngram ngram, const int count) const
void update(const double percentage)
virtual double progress() const
virtual bool hasMoreTokens() const
const std::string PROGRAM_NAME
std::list< std::string > NgramList
void lowercaseMode(const bool)
virtual std::string nextToken()
int main(int argc, char *argv[])
virtual void beginTransaction() const
void updateNgram(const Ngram ngram, const int count) const
void createNgramTable(const size_t cardinality) const
virtual void endTransaction() const
const Logger< _charT, _Traits > & endl(const Logger< _charT, _Traits > &lgr)