libstdc++
|
00001 // class template regex -*- C++ -*- 00002 00003 // Copyright (C) 2013-2016 Free Software Foundation, Inc. 00004 // 00005 // This file is part of the GNU ISO C++ Library. This library is free 00006 // software; you can redistribute it and/or modify it under the 00007 // terms of the GNU General Public License as published by the 00008 // Free Software Foundation; either version 3, or (at your option) 00009 // any later version. 00010 00011 // This library is distributed in the hope that it will be useful, 00012 // but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00014 // GNU General Public License for more details. 00015 00016 // Under Section 7 of GPL version 3, you are granted additional 00017 // permissions described in the GCC Runtime Library Exception, version 00018 // 3.1, as published by the Free Software Foundation. 00019 00020 // You should have received a copy of the GNU General Public License and 00021 // a copy of the GCC Runtime Library Exception along with this program; 00022 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see 00023 // <http://www.gnu.org/licenses/>. 00024 00025 /** 00026 * @file bits/regex_automaton.h 00027 * This is an internal header file, included by other library headers. 00028 * Do not attempt to use it directly. @headername{regex} 00029 */ 00030 00031 // This macro defines the maximal state number a NFA can have. 00032 #ifndef _GLIBCXX_REGEX_STATE_LIMIT 00033 #define _GLIBCXX_REGEX_STATE_LIMIT 100000 00034 #endif 00035 00036 namespace std _GLIBCXX_VISIBILITY(default) 00037 { 00038 namespace __detail 00039 { 00040 _GLIBCXX_BEGIN_NAMESPACE_VERSION 00041 00042 /** 00043 * @defgroup regex-detail Base and Implementation Classes 00044 * @ingroup regex 00045 * @{ 00046 */ 00047 00048 typedef long _StateIdT; 00049 static const _StateIdT _S_invalid_state_id = -1; 00050 00051 template<typename _CharT> 00052 using _Matcher = std::function<bool (_CharT)>; 00053 00054 /// Operation codes that define the type of transitions within the base NFA 00055 /// that represents the regular expression. 00056 enum _Opcode : int 00057 { 00058 _S_opcode_unknown, 00059 _S_opcode_alternative, 00060 _S_opcode_repeat, 00061 _S_opcode_backref, 00062 _S_opcode_line_begin_assertion, 00063 _S_opcode_line_end_assertion, 00064 _S_opcode_word_boundary, 00065 _S_opcode_subexpr_lookahead, 00066 _S_opcode_subexpr_begin, 00067 _S_opcode_subexpr_end, 00068 _S_opcode_dummy, 00069 _S_opcode_match, 00070 _S_opcode_accept, 00071 }; 00072 00073 struct _State_base 00074 { 00075 protected: 00076 _Opcode _M_opcode; // type of outgoing transition 00077 00078 public: 00079 _StateIdT _M_next; // outgoing transition 00080 union // Since they are mutually exclusive. 00081 { 00082 size_t _M_subexpr; // for _S_opcode_subexpr_* 00083 size_t _M_backref_index; // for _S_opcode_backref 00084 struct 00085 { 00086 // for _S_opcode_alternative, _S_opcode_repeat and 00087 // _S_opcode_subexpr_lookahead 00088 _StateIdT _M_alt; 00089 // for _S_opcode_word_boundary or _S_opcode_subexpr_lookahead or 00090 // quantifiers (ungreedy if set true) 00091 bool _M_neg; 00092 }; 00093 // For _S_opcode_match 00094 __gnu_cxx::__aligned_membuf<_Matcher<char>> _M_matcher_storage; 00095 }; 00096 00097 protected: 00098 explicit _State_base(_Opcode __opcode) 00099 : _M_opcode(__opcode), _M_next(_S_invalid_state_id) 00100 { } 00101 00102 public: 00103 bool 00104 _M_has_alt() 00105 { 00106 return _M_opcode == _S_opcode_alternative 00107 || _M_opcode == _S_opcode_repeat 00108 || _M_opcode == _S_opcode_subexpr_lookahead; 00109 } 00110 00111 #ifdef _GLIBCXX_DEBUG 00112 std::ostream& 00113 _M_print(std::ostream& ostr) const; 00114 00115 // Prints graphviz dot commands for state. 00116 std::ostream& 00117 _M_dot(std::ostream& __ostr, _StateIdT __id) const; 00118 #endif 00119 }; 00120 00121 template<typename _Char_type> 00122 struct _State : _State_base 00123 { 00124 typedef _Matcher<_Char_type> _MatcherT; 00125 static_assert(sizeof(_MatcherT) == sizeof(_Matcher<char>), 00126 "std::function<bool(T)> has the same size as " 00127 "std::function<bool(char)>"); 00128 static_assert(alignof(_MatcherT) == alignof(_Matcher<char>), 00129 "std::function<bool(T)> has the same alignment as " 00130 "std::function<bool(char)>"); 00131 00132 explicit 00133 _State(_Opcode __opcode) : _State_base(__opcode) 00134 { 00135 if (_M_opcode() == _S_opcode_match) 00136 new (this->_M_matcher_storage._M_addr()) _MatcherT(); 00137 } 00138 00139 _State(const _State& __rhs) : _State_base(__rhs) 00140 { 00141 if (__rhs._M_opcode() == _S_opcode_match) 00142 new (this->_M_matcher_storage._M_addr()) 00143 _MatcherT(__rhs._M_get_matcher()); 00144 } 00145 00146 _State(_State&& __rhs) : _State_base(__rhs) 00147 { 00148 if (__rhs._M_opcode() == _S_opcode_match) 00149 new (this->_M_matcher_storage._M_addr()) 00150 _MatcherT(std::move(__rhs._M_get_matcher())); 00151 } 00152 00153 _State& 00154 operator=(const _State&) = delete; 00155 00156 ~_State() 00157 { 00158 if (_M_opcode() == _S_opcode_match) 00159 _M_get_matcher().~_MatcherT(); 00160 } 00161 00162 // Since correct ctor and dtor rely on _M_opcode, it's better not to 00163 // change it over time. 00164 _Opcode 00165 _M_opcode() const 00166 { return _State_base::_M_opcode; } 00167 00168 bool 00169 _M_matches(_Char_type __char) const 00170 { return _M_get_matcher()(__char); } 00171 00172 _MatcherT& 00173 _M_get_matcher() 00174 { return *static_cast<_MatcherT*>(this->_M_matcher_storage._M_addr()); } 00175 00176 const _MatcherT& 00177 _M_get_matcher() const 00178 { 00179 return *static_cast<const _MatcherT*>( 00180 this->_M_matcher_storage._M_addr()); 00181 } 00182 }; 00183 00184 struct _NFA_base 00185 { 00186 typedef size_t _SizeT; 00187 typedef regex_constants::syntax_option_type _FlagT; 00188 00189 explicit 00190 _NFA_base(_FlagT __f) 00191 : _M_flags(__f), _M_start_state(0), _M_subexpr_count(0), 00192 _M_has_backref(false) 00193 { } 00194 00195 _NFA_base(_NFA_base&&) = default; 00196 00197 protected: 00198 ~_NFA_base() = default; 00199 00200 public: 00201 _FlagT 00202 _M_options() const 00203 { return _M_flags; } 00204 00205 _StateIdT 00206 _M_start() const 00207 { return _M_start_state; } 00208 00209 _SizeT 00210 _M_sub_count() const 00211 { return _M_subexpr_count; } 00212 00213 std::vector<size_t> _M_paren_stack; 00214 _FlagT _M_flags; 00215 _StateIdT _M_start_state; 00216 _SizeT _M_subexpr_count; 00217 bool _M_has_backref; 00218 }; 00219 00220 template<typename _TraitsT> 00221 struct _NFA 00222 : _NFA_base, std::vector<_State<typename _TraitsT::char_type>> 00223 { 00224 typedef typename _TraitsT::char_type _Char_type; 00225 typedef _State<_Char_type> _StateT; 00226 typedef _Matcher<_Char_type> _MatcherT; 00227 00228 _NFA(const typename _TraitsT::locale_type& __loc, _FlagT __flags) 00229 : _NFA_base(__flags) 00230 { _M_traits.imbue(__loc); } 00231 00232 // for performance reasons _NFA objects should only be moved not copied 00233 _NFA(const _NFA&) = delete; 00234 _NFA(_NFA&&) = default; 00235 00236 _StateIdT 00237 _M_insert_accept() 00238 { 00239 auto __ret = _M_insert_state(_StateT(_S_opcode_accept)); 00240 return __ret; 00241 } 00242 00243 _StateIdT 00244 _M_insert_alt(_StateIdT __next, _StateIdT __alt, bool __neg) 00245 { 00246 _StateT __tmp(_S_opcode_alternative); 00247 // It labels every quantifier to make greedy comparison easier in BFS 00248 // approach. 00249 __tmp._M_next = __next; 00250 __tmp._M_alt = __alt; 00251 return _M_insert_state(std::move(__tmp)); 00252 } 00253 00254 _StateIdT 00255 _M_insert_repeat(_StateIdT __next, _StateIdT __alt, bool __neg) 00256 { 00257 _StateT __tmp(_S_opcode_repeat); 00258 // It labels every quantifier to make greedy comparison easier in BFS 00259 // approach. 00260 __tmp._M_next = __next; 00261 __tmp._M_alt = __alt; 00262 __tmp._M_neg = __neg; 00263 return _M_insert_state(std::move(__tmp)); 00264 } 00265 00266 _StateIdT 00267 _M_insert_matcher(_MatcherT __m) 00268 { 00269 _StateT __tmp(_S_opcode_match); 00270 __tmp._M_get_matcher() = std::move(__m); 00271 return _M_insert_state(std::move(__tmp)); 00272 } 00273 00274 _StateIdT 00275 _M_insert_subexpr_begin() 00276 { 00277 auto __id = this->_M_subexpr_count++; 00278 this->_M_paren_stack.push_back(__id); 00279 _StateT __tmp(_S_opcode_subexpr_begin); 00280 __tmp._M_subexpr = __id; 00281 return _M_insert_state(std::move(__tmp)); 00282 } 00283 00284 _StateIdT 00285 _M_insert_subexpr_end() 00286 { 00287 _StateT __tmp(_S_opcode_subexpr_end); 00288 __tmp._M_subexpr = this->_M_paren_stack.back(); 00289 this->_M_paren_stack.pop_back(); 00290 return _M_insert_state(std::move(__tmp)); 00291 } 00292 00293 _StateIdT 00294 _M_insert_backref(size_t __index); 00295 00296 _StateIdT 00297 _M_insert_line_begin() 00298 { return _M_insert_state(_StateT(_S_opcode_line_begin_assertion)); } 00299 00300 _StateIdT 00301 _M_insert_line_end() 00302 { return _M_insert_state(_StateT(_S_opcode_line_end_assertion)); } 00303 00304 _StateIdT 00305 _M_insert_word_bound(bool __neg) 00306 { 00307 _StateT __tmp(_S_opcode_word_boundary); 00308 __tmp._M_neg = __neg; 00309 return _M_insert_state(std::move(__tmp)); 00310 } 00311 00312 _StateIdT 00313 _M_insert_lookahead(_StateIdT __alt, bool __neg) 00314 { 00315 _StateT __tmp(_S_opcode_subexpr_lookahead); 00316 __tmp._M_alt = __alt; 00317 __tmp._M_neg = __neg; 00318 return _M_insert_state(std::move(__tmp)); 00319 } 00320 00321 _StateIdT 00322 _M_insert_dummy() 00323 { return _M_insert_state(_StateT(_S_opcode_dummy)); } 00324 00325 _StateIdT 00326 _M_insert_state(_StateT __s) 00327 { 00328 this->push_back(std::move(__s)); 00329 if (this->size() > _GLIBCXX_REGEX_STATE_LIMIT) 00330 __throw_regex_error( 00331 regex_constants::error_space, 00332 "Number of NFA states exceeds limit. Please use shorter regex " 00333 "string, or use smaller brace expression, or make " 00334 "_GLIBCXX_REGEX_STATE_LIMIT larger."); 00335 return this->size()-1; 00336 } 00337 00338 // Eliminate dummy node in this NFA to make it compact. 00339 void 00340 _M_eliminate_dummy(); 00341 00342 #ifdef _GLIBCXX_DEBUG 00343 std::ostream& 00344 _M_dot(std::ostream& __ostr) const; 00345 #endif 00346 public: 00347 _TraitsT _M_traits; 00348 }; 00349 00350 /// Describes a sequence of one or more %_State, its current start 00351 /// and end(s). This structure contains fragments of an NFA during 00352 /// construction. 00353 template<typename _TraitsT> 00354 class _StateSeq 00355 { 00356 public: 00357 typedef _NFA<_TraitsT> _RegexT; 00358 00359 public: 00360 _StateSeq(_RegexT& __nfa, _StateIdT __s) 00361 : _M_nfa(__nfa), _M_start(__s), _M_end(__s) 00362 { } 00363 00364 _StateSeq(_RegexT& __nfa, _StateIdT __s, _StateIdT __end) 00365 : _M_nfa(__nfa), _M_start(__s), _M_end(__end) 00366 { } 00367 00368 // Append a state on *this and change *this to the new sequence. 00369 void 00370 _M_append(_StateIdT __id) 00371 { 00372 _M_nfa[_M_end]._M_next = __id; 00373 _M_end = __id; 00374 } 00375 00376 // Append a sequence on *this and change *this to the new sequence. 00377 void 00378 _M_append(const _StateSeq& __s) 00379 { 00380 _M_nfa[_M_end]._M_next = __s._M_start; 00381 _M_end = __s._M_end; 00382 } 00383 00384 // Clones an entire sequence. 00385 _StateSeq 00386 _M_clone(); 00387 00388 public: 00389 _RegexT& _M_nfa; 00390 _StateIdT _M_start; 00391 _StateIdT _M_end; 00392 }; 00393 00394 //@} regex-detail 00395 _GLIBCXX_END_NAMESPACE_VERSION 00396 } // namespace __detail 00397 } // namespace std 00398 00399 #include <bits/regex_automaton.tcc>