presage  0.9.2~beta
tokenizer.h
Go to the documentation of this file.
1 
2 /******************************************************
3  * Presage, an extensible predictive text entry system
4  * ---------------------------------------------------
5  *
6  * Copyright (C) 2008 Matteo Vescovi <matteo.vescovi@yahoo.co.uk>
7 
8  This program is free software; you can redistribute it and/or modify
9  it under the terms of the GNU General Public License as published by
10  the Free Software Foundation; either version 2 of the License, or
11  (at your option) any later version.
12 
13  This program is distributed in the hope that it will be useful,
14  but WITHOUT ANY WARRANTY; without even the implied warranty of
15  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  GNU General Public License for more details.
17 
18  You should have received a copy of the GNU General Public License along
19  with this program; if not, write to the Free Software Foundation, Inc.,
20  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
21  *
22  **********(*)*/
23 
24 
25 #ifndef PRESAGE_TOKENIZER
26 #define PRESAGE_TOKENIZER
27 
28 #ifdef HAVE_CONFIG_H
29 #include "config.h"
30 #endif
31 
32 #include <iostream>
33 #include <istream>
34 #include <string>
35 #include <assert.h>
36 
/**
 * Abstract base class for tokenizers that read from a std::istream.
 *
 * The class stores a reference to the stream together with two character
 * sets (blankspaces and separators) and a lowercase-mode flag. Concrete
 * tokenizers implement countTokens(), hasMoreTokens(), nextToken() and
 * progress().
 *
 * The referenced stream is not owned: it must outlive the tokenizer.
 *
 * NOTE(review): the constructor, destructor, accessors and the
 * isBlankspace()/isSeparator() predicates are defined out of line
 * (tokenizer.cpp); how they initialise offbeg/offend/offset/sstate is not
 * visible from this header -- confirm there before relying on it.
 */
class Tokenizer {
public:
    /**
     * @param stream       input stream to tokenize (held by reference)
     * @param blankspaces  characters to be treated as blankspace
     * @param separators   characters to be treated as separators
     */
    Tokenizer(std::istream& stream,
              const std::string blankspaces,
              const std::string separators );
    virtual ~Tokenizer();

    /** Number of tokens; exact semantics are defined by the subclass. */
    virtual int countTokens() = 0;

    /** Whether another token can be obtained; defined by the subclass. */
    virtual bool hasMoreTokens() const = 0;

    /** Returns the next token; defined by the subclass. */
    virtual std::string nextToken() = 0;

    /** Progress indicator; scale and meaning are defined by the subclass. */
    virtual double progress() const = 0;


    /** Sets the blankspace character set. */
    void blankspaceChars(const std::string);
    /** Returns the blankspace character set. */
    std::string blankspaceChars() const;

    /** Sets the separator character set. */
    void separatorChars(const std::string);
    /** Returns the separator character set. */
    std::string separatorChars() const;

    /** Sets the lowercase-mode flag. */
    void lowercaseMode(const bool);
    /** Returns the lowercase-mode flag. */
    bool lowercaseMode() const;

    /**
     * Returns the stream contents between offsets offbeg (inclusive) and
     * offend (exclusive) as a string.
     *
     * Characters are fetched one at a time: the stream is positioned at
     * each offset, its error state is cleared, and the character is read
     * with peek(). On exit the stream is positioned back at the offset
     * reported by tellg() on entry. Stream state flags are not restored
     * to their entry value.
     *
     * Although the method is const, it repositions the referenced stream
     * while it runs.
     */
    std::string streamToString() const {
        // Remember where the caller left the stream so it can be put back.
        const std::streamoff offbackup = stream.tellg();

        std::string str;
        std::streamoff curroff = offbeg;
        stream.seekg(curroff);
        while (curroff < offend) {
            // Drop any eof/fail flag so the peek below is attempted.
            stream.clear();
            // peek() yields an int; narrow explicitly. At end-of-stream
            // this stores EOF converted to char, as the implicit
            // conversion did before.
            str.push_back(static_cast<char>(stream.peek()));
            curroff++;
            stream.seekg(curroff);
        }

        stream.seekg(offbackup);
        return str;
    }

protected:
    /**
     * Scoped helper that moves a stream to a given offset on construction
     * and moves it back on destruction.
     *
     * The constructor records the stream's state flags and read position,
     * then seeks to the requested offset. The destructor seeks back to the
     * recorded position and re-applies the recorded flags with setstate(),
     * which adds flags but does not clear any that were set in between.
     */
    class StreamGuard {
    public:
        /**
         * @param so  stream to guard (held by reference)
         * @param of  offset to seek to for the lifetime of the guard
         */
        StreamGuard(std::istream& so, std::streamoff& of)
            : guardedStream(so) {
            currstate = guardedStream.rdstate();
            curroff   = guardedStream.tellg();
            guardedStream.seekg(of);
        }

        /** Returns the stream to the recorded position and flags. */
        ~StreamGuard() {
            guardedStream.seekg(curroff);
            guardedStream.setstate(currstate);
        }

    private:
        std::istream&     guardedStream;  ///< stream being guarded
        std::ios::iostate currstate;      ///< flags recorded at construction
        std::streamoff    curroff;        ///< position recorded at construction
    };

    std::istream&     stream;   ///< stream being tokenized (not owned)
    std::ios::iostate sstate;   ///< stream state; maintained outside this header
    std::streamoff    offbeg;   ///< first offset read by streamToString()
    std::streamoff    offend;   ///< one-past-last offset read by streamToString()
    std::streamoff    offset;   ///< offset maintained outside this header (presumably the current position -- confirm)

    /** Tests a character against the blankspace set (defined out of line). */
    bool isBlankspace(const int character) const;
    /** Tests a character against the separator set (defined out of line). */
    bool isSeparator (const int character) const;

private:
    std::string blankspaces;    ///< blankspace character set
    std::string separators;     ///< separator character set

    bool lowercase;             ///< lowercase-mode flag
};
159 
160 #endif // PRESAGE_TOKENIZER
Tokenizer(std::istream &stream, const std::string blankspaces, const std::string separators)
Definition: tokenizer.cpp:27
bool isBlankspace(const int character) const
Definition: tokenizer.cpp:91
StreamGuard(std::istream &so, std::streamoff &of)
Definition: tokenizer.h:127
bool lowercaseMode() const
Definition: tokenizer.cpp:86
virtual int countTokens()=0
std::string separators
Definition: tokenizer.h:155
virtual double progress() const =0
std::streamoff curroff
Definition: tokenizer.h:141
std::string separatorChars() const
Definition: tokenizer.cpp:76
virtual std::string nextToken()=0
std::streamoff offset
Definition: tokenizer.h:148
std::streamoff offend
Definition: tokenizer.h:147
std::istream & stream
Definition: tokenizer.h:144
std::istream & guardedStream
Definition: tokenizer.h:139
virtual bool hasMoreTokens() const =0
std::ios::iostate currstate
Definition: tokenizer.h:140
bool isSeparator(const int character) const
Definition: tokenizer.cpp:101
std::streamoff offbeg
Definition: tokenizer.h:146
std::string blankspaceChars() const
Definition: tokenizer.cpp:66
std::string blankspaces
Definition: tokenizer.h:154
bool lowercase
Definition: tokenizer.h:157
std::string streamToString() const
Definition: tokenizer.h:109
virtual ~Tokenizer()
Definition: tokenizer.cpp:53
std::ios::iostate sstate
Definition: tokenizer.h:145