basic_regex_parser.hpp 110 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141
  1. /*
  2. *
  3. * Copyright (c) 2004
  4. * John Maddock
  5. *
  6. * Use, modification and distribution are subject to the
  7. * Boost Software License, Version 1.0. (See accompanying file
  8. * LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  9. *
  10. */
  11. /*
  12. * LOCATION: see http://www.boost.org for most recent version.
  13. * FILE basic_regex_parser.hpp
  14. * VERSION see <boost/version.hpp>
  15. * DESCRIPTION: Declares template class basic_regex_parser.
  16. */
  17. #ifndef BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
  18. #define BOOST_REGEX_V4_BASIC_REGEX_PARSER_HPP
  19. #ifdef BOOST_MSVC
  20. #pragma warning(push)
  21. #pragma warning(disable: 4103)
  22. #endif
  23. #ifdef BOOST_HAS_ABI_HEADERS
  24. # include BOOST_ABI_PREFIX
  25. #endif
  26. #ifdef BOOST_MSVC
  27. #pragma warning(pop)
  28. #endif
  29. namespace boost{
  30. namespace BOOST_REGEX_DETAIL_NS{
  31. #ifdef BOOST_MSVC
  32. #pragma warning(push)
  33. #pragma warning(disable:4244 4800)
  34. #endif
  35. inline boost::intmax_t umax(mpl::false_ const&)
  36. {
  37. // Get out clause here, just in case numeric_limits is unspecialized:
  38. return std::numeric_limits<boost::intmax_t>::is_specialized ? (std::numeric_limits<boost::intmax_t>::max)() : INT_MAX;
  39. }
  40. inline boost::intmax_t umax(mpl::true_ const&)
  41. {
  42. return (std::numeric_limits<std::size_t>::max)();
  43. }
  44. inline boost::intmax_t umax()
  45. {
  46. return umax(mpl::bool_<std::numeric_limits<boost::intmax_t>::digits >= std::numeric_limits<std::size_t>::digits>());
  47. }
  48. template <class charT, class traits>
  49. class basic_regex_parser : public basic_regex_creator<charT, traits>
  50. {
  51. public:
  52. basic_regex_parser(regex_data<charT, traits>* data);
  53. void parse(const charT* p1, const charT* p2, unsigned flags);
  54. void fail(regex_constants::error_type error_code, std::ptrdiff_t position);
  55. void fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos);
  56. void fail(regex_constants::error_type error_code, std::ptrdiff_t position, const std::string& message)
  57. {
  58. fail(error_code, position, message, position);
  59. }
  60. bool parse_all();
  61. bool parse_basic();
  62. bool parse_extended();
  63. bool parse_literal();
  64. bool parse_open_paren();
  65. bool parse_basic_escape();
  66. bool parse_extended_escape();
  67. bool parse_match_any();
  68. bool parse_repeat(std::size_t low = 0, std::size_t high = (std::numeric_limits<std::size_t>::max)());
  69. bool parse_repeat_range(bool isbasic);
  70. bool parse_alt();
  71. bool parse_set();
  72. bool parse_backref();
  73. void parse_set_literal(basic_char_set<charT, traits>& char_set);
  74. bool parse_inner_set(basic_char_set<charT, traits>& char_set);
  75. bool parse_QE();
  76. bool parse_perl_extension();
  77. bool parse_perl_verb();
  78. bool match_verb(const char*);
  79. bool add_emacs_code(bool negate);
  80. bool unwind_alts(std::ptrdiff_t last_paren_start);
  81. digraph<charT> get_next_set_literal(basic_char_set<charT, traits>& char_set);
  82. charT unescape_character();
  83. regex_constants::syntax_option_type parse_options();
  84. private:
  85. typedef bool (basic_regex_parser::*parser_proc_type)();
  86. typedef typename traits::string_type string_type;
  87. typedef typename traits::char_class_type char_class_type;
  88. parser_proc_type m_parser_proc; // the main parser to use
  89. const charT* m_base; // the start of the string being parsed
  90. const charT* m_end; // the end of the string being parsed
  91. const charT* m_position; // our current parser position
  92. unsigned m_mark_count; // how many sub-expressions we have
  93. int m_mark_reset; // used to indicate that we're inside a (?|...) block.
  94. unsigned m_max_mark; // largest mark count seen inside a (?|...) block.
  95. std::ptrdiff_t m_paren_start; // where the last seen ')' began (where repeats are inserted).
  96. std::ptrdiff_t m_alt_insert_point; // where to insert the next alternative
  97. bool m_has_case_change; // true if somewhere in the current block the case has changed
  98. unsigned m_recursion_count; // How many times we've called parse_all.
  99. #if defined(BOOST_MSVC) && defined(_M_IX86)
  100. // This is an ugly warning suppression workaround (for warnings *inside* std::vector
  101. // that can not otherwise be suppressed)...
  102. BOOST_STATIC_ASSERT(sizeof(long) >= sizeof(void*));
  103. std::vector<long> m_alt_jumps; // list of alternative in the current scope.
  104. #else
  105. std::vector<std::ptrdiff_t> m_alt_jumps; // list of alternative in the current scope.
  106. #endif
  107. basic_regex_parser& operator=(const basic_regex_parser&);
  108. basic_regex_parser(const basic_regex_parser&);
  109. };
  110. template <class charT, class traits>
  111. basic_regex_parser<charT, traits>::basic_regex_parser(regex_data<charT, traits>* data)
  112. : basic_regex_creator<charT, traits>(data), m_mark_count(0), m_mark_reset(-1), m_max_mark(0), m_paren_start(0), m_alt_insert_point(0), m_has_case_change(false), m_recursion_count(0)
  113. {
  114. }
  115. template <class charT, class traits>
  116. void basic_regex_parser<charT, traits>::parse(const charT* p1, const charT* p2, unsigned l_flags)
  117. {
  118. // pass l_flags on to base class:
  119. this->init(l_flags);
  120. // set up pointers:
  121. m_position = m_base = p1;
  122. m_end = p2;
  123. // empty strings are errors:
  124. if((p1 == p2) &&
  125. (
  126. ((l_flags & regbase::main_option_type) != regbase::perl_syntax_group)
  127. || (l_flags & regbase::no_empty_expressions)
  128. )
  129. )
  130. {
  131. fail(regex_constants::error_empty, 0);
  132. return;
  133. }
  134. // select which parser to use:
  135. switch(l_flags & regbase::main_option_type)
  136. {
  137. case regbase::perl_syntax_group:
  138. {
  139. m_parser_proc = &basic_regex_parser<charT, traits>::parse_extended;
  140. //
  141. // Add a leading paren with index zero to give recursions a target:
  142. //
  143. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  144. br->index = 0;
  145. br->icase = this->flags() & regbase::icase;
  146. break;
  147. }
  148. case regbase::basic_syntax_group:
  149. m_parser_proc = &basic_regex_parser<charT, traits>::parse_basic;
  150. break;
  151. case regbase::literal:
  152. m_parser_proc = &basic_regex_parser<charT, traits>::parse_literal;
  153. break;
  154. default:
  155. // Ooops, someone has managed to set more than one of the main option flags,
  156. // so this must be an error:
  157. fail(regex_constants::error_unknown, 0, "An invalid combination of regular expression syntax flags was used.");
  158. return;
  159. }
  160. // parse all our characters:
  161. bool result = parse_all();
  162. //
  163. // Unwind our alternatives:
  164. //
  165. unwind_alts(-1);
  166. // reset l_flags as a global scope (?imsx) may have altered them:
  167. this->flags(l_flags);
  168. // if we haven't gobbled up all the characters then we must
  169. // have had an unexpected ')' :
  170. if(!result)
  171. {
  172. fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Found a closing ) with no corresponding openening parenthesis.");
  173. return;
  174. }
  175. // if an error has been set then give up now:
  176. if(this->m_pdata->m_status)
  177. return;
  178. // fill in our sub-expression count:
  179. this->m_pdata->m_mark_count = 1 + m_mark_count;
  180. this->finalize(p1, p2);
  181. }
  182. template <class charT, class traits>
  183. void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position)
  184. {
  185. // get the error message:
  186. std::string message = this->m_pdata->m_ptraits->error_string(error_code);
  187. fail(error_code, position, message);
  188. }
  189. template <class charT, class traits>
  190. void basic_regex_parser<charT, traits>::fail(regex_constants::error_type error_code, std::ptrdiff_t position, std::string message, std::ptrdiff_t start_pos)
  191. {
  192. if(0 == this->m_pdata->m_status) // update the error code if not already set
  193. this->m_pdata->m_status = error_code;
  194. m_position = m_end; // don't bother parsing anything else
  195. #ifndef BOOST_NO_TEMPLATED_ITERATOR_CONSTRUCTORS
  196. //
  197. // Augment error message with the regular expression text:
  198. //
  199. if(start_pos == position)
  200. start_pos = (std::max)(static_cast<std::ptrdiff_t>(0), position - static_cast<std::ptrdiff_t>(10));
  201. std::ptrdiff_t end_pos = (std::min)(position + static_cast<std::ptrdiff_t>(10), static_cast<std::ptrdiff_t>(m_end - m_base));
  202. if(error_code != regex_constants::error_empty)
  203. {
  204. if((start_pos != 0) || (end_pos != (m_end - m_base)))
  205. message += " The error occurred while parsing the regular expression fragment: '";
  206. else
  207. message += " The error occurred while parsing the regular expression: '";
  208. if(start_pos != end_pos)
  209. {
  210. message += std::string(m_base + start_pos, m_base + position);
  211. message += ">>>HERE>>>";
  212. message += std::string(m_base + position, m_base + end_pos);
  213. }
  214. message += "'.";
  215. }
  216. #endif
  217. #ifndef BOOST_NO_EXCEPTIONS
  218. if(0 == (this->flags() & regex_constants::no_except))
  219. {
  220. boost::regex_error e(message, error_code, position);
  221. e.raise();
  222. }
  223. #else
  224. (void)position; // suppress warnings.
  225. #endif
  226. }
  227. template <class charT, class traits>
  228. bool basic_regex_parser<charT, traits>::parse_all()
  229. {
  230. if (++m_recursion_count > 400)
  231. {
  232. // exceeded internal limits
  233. fail(boost::regex_constants::error_complexity, m_position - m_base, "Exceeded nested brace limit.");
  234. }
  235. bool result = true;
  236. while(result && (m_position != m_end))
  237. {
  238. result = (this->*m_parser_proc)();
  239. }
  240. --m_recursion_count;
  241. return result;
  242. }
  243. #ifdef BOOST_MSVC
  244. #pragma warning(push)
  245. #pragma warning(disable:4702)
  246. #endif
  247. template <class charT, class traits>
  248. bool basic_regex_parser<charT, traits>::parse_basic()
  249. {
  250. switch(this->m_traits.syntax_type(*m_position))
  251. {
  252. case regex_constants::syntax_escape:
  253. return parse_basic_escape();
  254. case regex_constants::syntax_dot:
  255. return parse_match_any();
  256. case regex_constants::syntax_caret:
  257. ++m_position;
  258. this->append_state(syntax_element_start_line);
  259. break;
  260. case regex_constants::syntax_dollar:
  261. ++m_position;
  262. this->append_state(syntax_element_end_line);
  263. break;
  264. case regex_constants::syntax_star:
  265. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line))
  266. return parse_literal();
  267. else
  268. {
  269. ++m_position;
  270. return parse_repeat();
  271. }
  272. case regex_constants::syntax_plus:
  273. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
  274. return parse_literal();
  275. else
  276. {
  277. ++m_position;
  278. return parse_repeat(1);
  279. }
  280. case regex_constants::syntax_question:
  281. if(!(this->m_last_state) || (this->m_last_state->type == syntax_element_start_line) || !(this->flags() & regbase::emacs_ex))
  282. return parse_literal();
  283. else
  284. {
  285. ++m_position;
  286. return parse_repeat(0, 1);
  287. }
  288. case regex_constants::syntax_open_set:
  289. return parse_set();
  290. case regex_constants::syntax_newline:
  291. if(this->flags() & regbase::newline_alt)
  292. return parse_alt();
  293. else
  294. return parse_literal();
  295. default:
  296. return parse_literal();
  297. }
  298. return true;
  299. }
  300. template <class charT, class traits>
  301. bool basic_regex_parser<charT, traits>::parse_extended()
  302. {
  303. bool result = true;
  304. switch(this->m_traits.syntax_type(*m_position))
  305. {
  306. case regex_constants::syntax_open_mark:
  307. return parse_open_paren();
  308. case regex_constants::syntax_close_mark:
  309. return false;
  310. case regex_constants::syntax_escape:
  311. return parse_extended_escape();
  312. case regex_constants::syntax_dot:
  313. return parse_match_any();
  314. case regex_constants::syntax_caret:
  315. ++m_position;
  316. this->append_state(
  317. (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_start : syntax_element_start_line));
  318. break;
  319. case regex_constants::syntax_dollar:
  320. ++m_position;
  321. this->append_state(
  322. (this->flags() & regex_constants::no_mod_m ? syntax_element_buffer_end : syntax_element_end_line));
  323. break;
  324. case regex_constants::syntax_star:
  325. if(m_position == this->m_base)
  326. {
  327. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"*\" cannot start a regular expression.");
  328. return false;
  329. }
  330. ++m_position;
  331. return parse_repeat();
  332. case regex_constants::syntax_question:
  333. if(m_position == this->m_base)
  334. {
  335. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"?\" cannot start a regular expression.");
  336. return false;
  337. }
  338. ++m_position;
  339. return parse_repeat(0,1);
  340. case regex_constants::syntax_plus:
  341. if(m_position == this->m_base)
  342. {
  343. fail(regex_constants::error_badrepeat, 0, "The repeat operator \"+\" cannot start a regular expression.");
  344. return false;
  345. }
  346. ++m_position;
  347. return parse_repeat(1);
  348. case regex_constants::syntax_open_brace:
  349. ++m_position;
  350. return parse_repeat_range(false);
  351. case regex_constants::syntax_close_brace:
  352. if((this->flags() & regbase::no_perl_ex) == regbase::no_perl_ex)
  353. {
  354. fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
  355. return false;
  356. }
  357. result = parse_literal();
  358. break;
  359. case regex_constants::syntax_or:
  360. return parse_alt();
  361. case regex_constants::syntax_open_set:
  362. return parse_set();
  363. case regex_constants::syntax_newline:
  364. if(this->flags() & regbase::newline_alt)
  365. return parse_alt();
  366. else
  367. return parse_literal();
  368. case regex_constants::syntax_hash:
  369. //
  370. // If we have a mod_x flag set, then skip until
  371. // we get to a newline character:
  372. //
  373. if((this->flags()
  374. & (regbase::no_perl_ex|regbase::mod_x))
  375. == regbase::mod_x)
  376. {
  377. while((m_position != m_end) && !is_separator(*m_position++)){}
  378. return true;
  379. }
  380. BOOST_FALLTHROUGH;
  381. default:
  382. result = parse_literal();
  383. break;
  384. }
  385. return result;
  386. }
  387. #ifdef BOOST_MSVC
  388. #pragma warning(pop)
  389. #endif
  390. template <class charT, class traits>
  391. bool basic_regex_parser<charT, traits>::parse_literal()
  392. {
  393. // append this as a literal provided it's not a space character
  394. // or the perl option regbase::mod_x is not set:
  395. if(
  396. ((this->flags()
  397. & (regbase::main_option_type|regbase::mod_x|regbase::no_perl_ex))
  398. != regbase::mod_x)
  399. || !this->m_traits.isctype(*m_position, this->m_mask_space))
  400. this->append_literal(*m_position);
  401. ++m_position;
  402. return true;
  403. }
  404. template <class charT, class traits>
  405. bool basic_regex_parser<charT, traits>::parse_open_paren()
  406. {
  407. //
  408. // skip the '(' and error check:
  409. //
  410. if(++m_position == m_end)
  411. {
  412. fail(regex_constants::error_paren, m_position - m_base);
  413. return false;
  414. }
  415. //
  416. // begin by checking for a perl-style (?...) extension:
  417. //
  418. if(
  419. ((this->flags() & (regbase::main_option_type | regbase::no_perl_ex)) == 0)
  420. || ((this->flags() & (regbase::main_option_type | regbase::emacs_ex)) == (regbase::basic_syntax_group|regbase::emacs_ex))
  421. )
  422. {
  423. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question)
  424. return parse_perl_extension();
  425. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_star)
  426. return parse_perl_verb();
  427. }
  428. //
  429. // update our mark count, and append the required state:
  430. //
  431. unsigned markid = 0;
  432. if(0 == (this->flags() & regbase::nosubs))
  433. {
  434. markid = ++m_mark_count;
  435. #ifndef BOOST_NO_STD_DISTANCE
  436. if(this->flags() & regbase::save_subexpression_location)
  437. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 1, 0));
  438. #else
  439. if(this->flags() & regbase::save_subexpression_location)
  440. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 1, 0));
  441. #endif
  442. }
  443. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  444. pb->index = markid;
  445. pb->icase = this->flags() & regbase::icase;
  446. std::ptrdiff_t last_paren_start = this->getoffset(pb);
  447. // back up insertion point for alternations, and set new point:
  448. std::ptrdiff_t last_alt_point = m_alt_insert_point;
  449. this->m_pdata->m_data.align();
  450. m_alt_insert_point = this->m_pdata->m_data.size();
  451. //
  452. // back up the current flags in case we have a nested (?imsx) group:
  453. //
  454. regex_constants::syntax_option_type opts = this->flags();
  455. bool old_case_change = m_has_case_change;
  456. m_has_case_change = false; // no changes to this scope as yet...
  457. //
  458. // Back up branch reset data in case we have a nested (?|...)
  459. //
  460. int mark_reset = m_mark_reset;
  461. m_mark_reset = -1;
  462. //
  463. // now recursively add more states, this will terminate when we get to a
  464. // matching ')' :
  465. //
  466. parse_all();
  467. //
  468. // Unwind pushed alternatives:
  469. //
  470. if(0 == unwind_alts(last_paren_start))
  471. return false;
  472. //
  473. // restore flags:
  474. //
  475. if(m_has_case_change)
  476. {
  477. // the case has changed in one or more of the alternatives
  478. // within the scoped (...) block: we have to add a state
  479. // to reset the case sensitivity:
  480. static_cast<re_case*>(
  481. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  482. )->icase = opts & regbase::icase;
  483. }
  484. this->flags(opts);
  485. m_has_case_change = old_case_change;
  486. //
  487. // restore branch reset:
  488. //
  489. m_mark_reset = mark_reset;
  490. //
  491. // we either have a ')' or we have run out of characters prematurely:
  492. //
  493. if(m_position == m_end)
  494. {
  495. this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
  496. return false;
  497. }
  498. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  499. return false;
  500. #ifndef BOOST_NO_STD_DISTANCE
  501. if(markid && (this->flags() & regbase::save_subexpression_location))
  502. this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position);
  503. #else
  504. if(markid && (this->flags() & regbase::save_subexpression_location))
  505. this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base);
  506. #endif
  507. ++m_position;
  508. //
  509. // append closing parenthesis state:
  510. //
  511. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  512. pb->index = markid;
  513. pb->icase = this->flags() & regbase::icase;
  514. this->m_paren_start = last_paren_start;
  515. //
  516. // restore the alternate insertion point:
  517. //
  518. this->m_alt_insert_point = last_alt_point;
  519. //
  520. // allow backrefs to this mark:
  521. //
  522. if((markid > 0) && (markid < sizeof(unsigned) * CHAR_BIT))
  523. this->m_backrefs |= 1u << (markid - 1);
  524. return true;
  525. }
  526. template <class charT, class traits>
  527. bool basic_regex_parser<charT, traits>::parse_basic_escape()
  528. {
  529. if(++m_position == m_end)
  530. {
  531. fail(regex_constants::error_paren, m_position - m_base);
  532. return false;
  533. }
  534. bool result = true;
  535. switch(this->m_traits.escape_syntax_type(*m_position))
  536. {
  537. case regex_constants::syntax_open_mark:
  538. return parse_open_paren();
  539. case regex_constants::syntax_close_mark:
  540. return false;
  541. case regex_constants::syntax_plus:
  542. if(this->flags() & regex_constants::bk_plus_qm)
  543. {
  544. ++m_position;
  545. return parse_repeat(1);
  546. }
  547. else
  548. return parse_literal();
  549. case regex_constants::syntax_question:
  550. if(this->flags() & regex_constants::bk_plus_qm)
  551. {
  552. ++m_position;
  553. return parse_repeat(0, 1);
  554. }
  555. else
  556. return parse_literal();
  557. case regex_constants::syntax_open_brace:
  558. if(this->flags() & regbase::no_intervals)
  559. return parse_literal();
  560. ++m_position;
  561. return parse_repeat_range(true);
  562. case regex_constants::syntax_close_brace:
  563. if(this->flags() & regbase::no_intervals)
  564. return parse_literal();
  565. fail(regex_constants::error_brace, this->m_position - this->m_base, "Found a closing repetition operator } with no corresponding {.");
  566. return false;
  567. case regex_constants::syntax_or:
  568. if(this->flags() & regbase::bk_vbar)
  569. return parse_alt();
  570. else
  571. result = parse_literal();
  572. break;
  573. case regex_constants::syntax_digit:
  574. return parse_backref();
  575. case regex_constants::escape_type_start_buffer:
  576. if(this->flags() & regbase::emacs_ex)
  577. {
  578. ++m_position;
  579. this->append_state(syntax_element_buffer_start);
  580. }
  581. else
  582. result = parse_literal();
  583. break;
  584. case regex_constants::escape_type_end_buffer:
  585. if(this->flags() & regbase::emacs_ex)
  586. {
  587. ++m_position;
  588. this->append_state(syntax_element_buffer_end);
  589. }
  590. else
  591. result = parse_literal();
  592. break;
  593. case regex_constants::escape_type_word_assert:
  594. if(this->flags() & regbase::emacs_ex)
  595. {
  596. ++m_position;
  597. this->append_state(syntax_element_word_boundary);
  598. }
  599. else
  600. result = parse_literal();
  601. break;
  602. case regex_constants::escape_type_not_word_assert:
  603. if(this->flags() & regbase::emacs_ex)
  604. {
  605. ++m_position;
  606. this->append_state(syntax_element_within_word);
  607. }
  608. else
  609. result = parse_literal();
  610. break;
  611. case regex_constants::escape_type_left_word:
  612. if(this->flags() & regbase::emacs_ex)
  613. {
  614. ++m_position;
  615. this->append_state(syntax_element_word_start);
  616. }
  617. else
  618. result = parse_literal();
  619. break;
  620. case regex_constants::escape_type_right_word:
  621. if(this->flags() & regbase::emacs_ex)
  622. {
  623. ++m_position;
  624. this->append_state(syntax_element_word_end);
  625. }
  626. else
  627. result = parse_literal();
  628. break;
  629. default:
  630. if(this->flags() & regbase::emacs_ex)
  631. {
  632. bool negate = true;
  633. switch(*m_position)
  634. {
  635. case 'w':
  636. negate = false;
  637. BOOST_FALLTHROUGH;
  638. case 'W':
  639. {
  640. basic_char_set<charT, traits> char_set;
  641. if(negate)
  642. char_set.negate();
  643. char_set.add_class(this->m_word_mask);
  644. if(0 == this->append_set(char_set))
  645. {
  646. fail(regex_constants::error_ctype, m_position - m_base);
  647. return false;
  648. }
  649. ++m_position;
  650. return true;
  651. }
  652. case 's':
  653. negate = false;
  654. BOOST_FALLTHROUGH;
  655. case 'S':
  656. return add_emacs_code(negate);
  657. case 'c':
  658. case 'C':
  659. // not supported yet:
  660. fail(regex_constants::error_escape, m_position - m_base, "The \\c and \\C escape sequences are not supported by POSIX basic regular expressions: try the Perl syntax instead.");
  661. return false;
  662. default:
  663. break;
  664. }
  665. }
  666. result = parse_literal();
  667. break;
  668. }
  669. return result;
  670. }
  671. template <class charT, class traits>
  672. bool basic_regex_parser<charT, traits>::parse_extended_escape()
  673. {
  674. ++m_position;
  675. if(m_position == m_end)
  676. {
  677. fail(regex_constants::error_escape, m_position - m_base, "Incomplete escape sequence found.");
  678. return false;
  679. }
  680. bool negate = false; // in case this is a character class escape: \w \d etc
  681. switch(this->m_traits.escape_syntax_type(*m_position))
  682. {
  683. case regex_constants::escape_type_not_class:
  684. negate = true;
  685. BOOST_FALLTHROUGH;
  686. case regex_constants::escape_type_class:
  687. {
  688. escape_type_class_jump:
  689. typedef typename traits::char_class_type m_type;
  690. m_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  691. if(m != 0)
  692. {
  693. basic_char_set<charT, traits> char_set;
  694. if(negate)
  695. char_set.negate();
  696. char_set.add_class(m);
  697. if(0 == this->append_set(char_set))
  698. {
  699. fail(regex_constants::error_ctype, m_position - m_base);
  700. return false;
  701. }
  702. ++m_position;
  703. return true;
  704. }
  705. //
  706. // not a class, just a regular unknown escape:
  707. //
  708. this->append_literal(unescape_character());
  709. break;
  710. }
  711. case regex_constants::syntax_digit:
  712. return parse_backref();
  713. case regex_constants::escape_type_left_word:
  714. ++m_position;
  715. this->append_state(syntax_element_word_start);
  716. break;
  717. case regex_constants::escape_type_right_word:
  718. ++m_position;
  719. this->append_state(syntax_element_word_end);
  720. break;
  721. case regex_constants::escape_type_start_buffer:
  722. ++m_position;
  723. this->append_state(syntax_element_buffer_start);
  724. break;
  725. case regex_constants::escape_type_end_buffer:
  726. ++m_position;
  727. this->append_state(syntax_element_buffer_end);
  728. break;
  729. case regex_constants::escape_type_word_assert:
  730. ++m_position;
  731. this->append_state(syntax_element_word_boundary);
  732. break;
  733. case regex_constants::escape_type_not_word_assert:
  734. ++m_position;
  735. this->append_state(syntax_element_within_word);
  736. break;
  737. case regex_constants::escape_type_Z:
  738. ++m_position;
  739. this->append_state(syntax_element_soft_buffer_end);
  740. break;
  741. case regex_constants::escape_type_Q:
  742. return parse_QE();
  743. case regex_constants::escape_type_C:
  744. return parse_match_any();
  745. case regex_constants::escape_type_X:
  746. ++m_position;
  747. this->append_state(syntax_element_combining);
  748. break;
  749. case regex_constants::escape_type_G:
  750. ++m_position;
  751. this->append_state(syntax_element_restart_continue);
  752. break;
  753. case regex_constants::escape_type_not_property:
  754. negate = true;
  755. BOOST_FALLTHROUGH;
  756. case regex_constants::escape_type_property:
  757. {
  758. ++m_position;
  759. char_class_type m;
  760. if(m_position == m_end)
  761. {
  762. fail(regex_constants::error_escape, m_position - m_base, "Incomplete property escape found.");
  763. return false;
  764. }
  765. // maybe have \p{ddd}
  766. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  767. {
  768. const charT* base = m_position;
  769. // skip forward until we find enclosing brace:
  770. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  771. ++m_position;
  772. if(m_position == m_end)
  773. {
  774. fail(regex_constants::error_escape, m_position - m_base, "Closing } missing from property escape sequence.");
  775. return false;
  776. }
  777. m = this->m_traits.lookup_classname(++base, m_position++);
  778. }
  779. else
  780. {
  781. m = this->m_traits.lookup_classname(m_position, m_position+1);
  782. ++m_position;
  783. }
  784. if(m != 0)
  785. {
  786. basic_char_set<charT, traits> char_set;
  787. if(negate)
  788. char_set.negate();
  789. char_set.add_class(m);
  790. if(0 == this->append_set(char_set))
  791. {
  792. fail(regex_constants::error_ctype, m_position - m_base);
  793. return false;
  794. }
  795. return true;
  796. }
  797. fail(regex_constants::error_ctype, m_position - m_base, "Escape sequence was neither a valid property nor a valid character class name.");
  798. return false;
  799. }
  800. case regex_constants::escape_type_reset_start_mark:
  801. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  802. {
  803. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  804. pb->index = -5;
  805. pb->icase = this->flags() & regbase::icase;
  806. this->m_pdata->m_data.align();
  807. ++m_position;
  808. return true;
  809. }
  810. goto escape_type_class_jump;
  811. case regex_constants::escape_type_line_ending:
  812. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  813. {
  814. const charT* e = get_escape_R_string<charT>();
  815. const charT* old_position = m_position;
  816. const charT* old_end = m_end;
  817. const charT* old_base = m_base;
  818. m_position = e;
  819. m_base = e;
  820. m_end = e + traits::length(e);
  821. bool r = parse_all();
  822. m_position = ++old_position;
  823. m_end = old_end;
  824. m_base = old_base;
  825. return r;
  826. }
  827. goto escape_type_class_jump;
  828. case regex_constants::escape_type_extended_backref:
  829. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  830. {
  831. bool have_brace = false;
  832. bool negative = false;
  833. static const char* incomplete_message = "Incomplete \\g escape found.";
  834. if(++m_position == m_end)
  835. {
  836. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  837. return false;
  838. }
  839. // maybe have \g{ddd}
  840. regex_constants::syntax_type syn = this->m_traits.syntax_type(*m_position);
  841. regex_constants::syntax_type syn_end = 0;
  842. if((syn == regex_constants::syntax_open_brace)
  843. || (syn == regex_constants::escape_type_left_word)
  844. || (syn == regex_constants::escape_type_end_buffer))
  845. {
  846. if(++m_position == m_end)
  847. {
  848. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  849. return false;
  850. }
  851. have_brace = true;
  852. switch(syn)
  853. {
  854. case regex_constants::syntax_open_brace:
  855. syn_end = regex_constants::syntax_close_brace;
  856. break;
  857. case regex_constants::escape_type_left_word:
  858. syn_end = regex_constants::escape_type_right_word;
  859. break;
  860. default:
  861. syn_end = regex_constants::escape_type_end_buffer;
  862. break;
  863. }
  864. }
  865. negative = (*m_position == static_cast<charT>('-'));
  866. if((negative) && (++m_position == m_end))
  867. {
  868. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  869. return false;
  870. }
  871. const charT* pc = m_position;
  872. boost::intmax_t i = this->m_traits.toi(pc, m_end, 10);
  873. if((i < 0) && syn_end)
  874. {
  875. // Check for a named capture, get the leftmost one if there is more than one:
  876. const charT* base = m_position;
  877. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != syn_end))
  878. {
  879. ++m_position;
  880. }
  881. i = hash_value_from_capture_name(base, m_position);
  882. pc = m_position;
  883. }
  884. if(negative)
  885. i = 1 + m_mark_count - i;
  886. if(((i > 0) && (i < std::numeric_limits<unsigned>::digits) && (i - 1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (i-1)))) || ((i > 10000) && (this->m_pdata->get_id(i) > 0) && (this->m_pdata->get_id(i)-1 < static_cast<boost::intmax_t>(sizeof(unsigned) * CHAR_BIT)) && (this->m_backrefs & (1u << (this->m_pdata->get_id(i)-1)))))
  887. {
  888. m_position = pc;
  889. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
  890. pb->index = i;
  891. pb->icase = this->flags() & regbase::icase;
  892. }
  893. else
  894. {
  895. fail(regex_constants::error_backref, m_position - m_base);
  896. return false;
  897. }
  898. m_position = pc;
  899. if(have_brace)
  900. {
  901. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != syn_end))
  902. {
  903. fail(regex_constants::error_escape, m_position - m_base, incomplete_message);
  904. return false;
  905. }
  906. ++m_position;
  907. }
  908. return true;
  909. }
  910. goto escape_type_class_jump;
  911. case regex_constants::escape_type_control_v:
  912. if(0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  913. goto escape_type_class_jump;
  914. BOOST_FALLTHROUGH;
  915. default:
  916. this->append_literal(unescape_character());
  917. break;
  918. }
  919. return true;
  920. }
  921. template <class charT, class traits>
  922. bool basic_regex_parser<charT, traits>::parse_match_any()
  923. {
  924. //
  925. // we have a '.' that can match any character:
  926. //
  927. ++m_position;
  928. static_cast<re_dot*>(
  929. this->append_state(syntax_element_wild, sizeof(re_dot))
  930. )->mask = static_cast<unsigned char>(this->flags() & regbase::no_mod_s
  931. ? BOOST_REGEX_DETAIL_NS::force_not_newline
  932. : this->flags() & regbase::mod_s ?
  933. BOOST_REGEX_DETAIL_NS::force_newline : BOOST_REGEX_DETAIL_NS::dont_care);
  934. return true;
  935. }
  936. template <class charT, class traits>
  937. bool basic_regex_parser<charT, traits>::parse_repeat(std::size_t low, std::size_t high)
  938. {
  939. bool greedy = true;
  940. bool pocessive = false;
  941. std::size_t insert_point;
  942. //
  943. // when we get to here we may have a non-greedy ? mark still to come:
  944. //
  945. if((m_position != m_end)
  946. && (
  947. (0 == (this->flags() & (regbase::main_option_type | regbase::no_perl_ex)))
  948. || ((regbase::basic_syntax_group|regbase::emacs_ex) == (this->flags() & (regbase::main_option_type | regbase::emacs_ex)))
  949. )
  950. )
  951. {
  952. // OK we have a perl or emacs regex, check for a '?':
  953. if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
  954. {
  955. // whitespace skip:
  956. while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  957. ++m_position;
  958. }
  959. if((m_position != m_end) && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_question))
  960. {
  961. greedy = false;
  962. ++m_position;
  963. }
  964. // for perl regexes only check for pocessive ++ repeats.
  965. if((m_position != m_end)
  966. && (0 == (this->flags() & regbase::main_option_type))
  967. && (this->m_traits.syntax_type(*m_position) == regex_constants::syntax_plus))
  968. {
  969. pocessive = true;
  970. ++m_position;
  971. }
  972. }
  973. if(0 == this->m_last_state)
  974. {
  975. fail(regex_constants::error_badrepeat, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_position), "Nothing to repeat.");
  976. return false;
  977. }
  978. if(this->m_last_state->type == syntax_element_endmark)
  979. {
  980. // insert a repeat before the '(' matching the last ')':
  981. insert_point = this->m_paren_start;
  982. }
  983. else if((this->m_last_state->type == syntax_element_literal) && (static_cast<re_literal*>(this->m_last_state)->length > 1))
  984. {
  985. // the last state was a literal with more than one character, split it in two:
  986. re_literal* lit = static_cast<re_literal*>(this->m_last_state);
  987. charT c = (static_cast<charT*>(static_cast<void*>(lit+1)))[lit->length - 1];
  988. lit->length -= 1;
  989. // now append new state:
  990. lit = static_cast<re_literal*>(this->append_state(syntax_element_literal, sizeof(re_literal) + sizeof(charT)));
  991. lit->length = 1;
  992. (static_cast<charT*>(static_cast<void*>(lit+1)))[0] = c;
  993. insert_point = this->getoffset(this->m_last_state);
  994. }
  995. else
  996. {
  997. // repeat the last state whatever it was, need to add some error checking here:
  998. switch(this->m_last_state->type)
  999. {
  1000. case syntax_element_start_line:
  1001. case syntax_element_end_line:
  1002. case syntax_element_word_boundary:
  1003. case syntax_element_within_word:
  1004. case syntax_element_word_start:
  1005. case syntax_element_word_end:
  1006. case syntax_element_buffer_start:
  1007. case syntax_element_buffer_end:
  1008. case syntax_element_alt:
  1009. case syntax_element_soft_buffer_end:
  1010. case syntax_element_restart_continue:
  1011. case syntax_element_jump:
  1012. case syntax_element_startmark:
  1013. case syntax_element_backstep:
  1014. // can't legally repeat any of the above:
  1015. fail(regex_constants::error_badrepeat, m_position - m_base);
  1016. return false;
  1017. default:
  1018. // do nothing...
  1019. break;
  1020. }
  1021. insert_point = this->getoffset(this->m_last_state);
  1022. }
  1023. //
  1024. // OK we now know what to repeat, so insert the repeat around it:
  1025. //
  1026. re_repeat* rep = static_cast<re_repeat*>(this->insert_state(insert_point, syntax_element_rep, re_repeater_size));
  1027. rep->min = low;
  1028. rep->max = high;
  1029. rep->greedy = greedy;
  1030. rep->leading = false;
  1031. // store our repeater position for later:
  1032. std::ptrdiff_t rep_off = this->getoffset(rep);
  1033. // and append a back jump to the repeat:
  1034. re_jump* jmp = static_cast<re_jump*>(this->append_state(syntax_element_jump, sizeof(re_jump)));
  1035. jmp->alt.i = rep_off - this->getoffset(jmp);
  1036. this->m_pdata->m_data.align();
  1037. // now fill in the alt jump for the repeat:
  1038. rep = static_cast<re_repeat*>(this->getaddress(rep_off));
  1039. rep->alt.i = this->m_pdata->m_data.size() - rep_off;
  1040. //
  1041. // If the repeat is pocessive then bracket the repeat with a (?>...)
  1042. // independent sub-expression construct:
  1043. //
  1044. if(pocessive)
  1045. {
  1046. if(m_position != m_end)
  1047. {
  1048. //
  1049. // Check for illegal following quantifier, we have to do this here, because
  1050. // the extra states we insert below circumvents our usual error checking :-(
  1051. //
  1052. bool contin = false;
  1053. do
  1054. {
  1055. if ((this->flags() & (regbase::main_option_type | regbase::mod_x | regbase::no_perl_ex)) == regbase::mod_x)
  1056. {
  1057. // whitespace skip:
  1058. while ((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1059. ++m_position;
  1060. }
  1061. if (m_position != m_end)
  1062. {
  1063. switch (this->m_traits.syntax_type(*m_position))
  1064. {
  1065. case regex_constants::syntax_star:
  1066. case regex_constants::syntax_plus:
  1067. case regex_constants::syntax_question:
  1068. case regex_constants::syntax_open_brace:
  1069. fail(regex_constants::error_badrepeat, m_position - m_base);
  1070. return false;
  1071. case regex_constants::syntax_open_mark:
  1072. // Do we have a comment? If so we need to skip it here...
  1073. if ((m_position + 2 < m_end) && this->m_traits.syntax_type(*(m_position + 1)) == regex_constants::syntax_question
  1074. && this->m_traits.syntax_type(*(m_position + 2)) == regex_constants::syntax_hash)
  1075. {
  1076. while ((m_position != m_end)
  1077. && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark)) {
  1078. }
  1079. contin = true;
  1080. }
  1081. else
  1082. contin = false;
  1083. }
  1084. }
  1085. else
  1086. contin = false;
  1087. } while (contin);
  1088. }
  1089. re_brace* pb = static_cast<re_brace*>(this->insert_state(insert_point, syntax_element_startmark, sizeof(re_brace)));
  1090. pb->index = -3;
  1091. pb->icase = this->flags() & regbase::icase;
  1092. jmp = static_cast<re_jump*>(this->insert_state(insert_point + sizeof(re_brace), syntax_element_jump, sizeof(re_jump)));
  1093. this->m_pdata->m_data.align();
  1094. jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
  1095. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  1096. pb->index = -3;
  1097. pb->icase = this->flags() & regbase::icase;
  1098. }
  1099. return true;
  1100. }
  1101. template <class charT, class traits>
  1102. bool basic_regex_parser<charT, traits>::parse_repeat_range(bool isbasic)
  1103. {
  1104. static const char* incomplete_message = "Missing } in quantified repetition.";
  1105. //
  1106. // parse a repeat-range:
  1107. //
  1108. std::size_t min, max;
  1109. boost::intmax_t v;
  1110. // skip whitespace:
  1111. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1112. ++m_position;
  1113. if(this->m_position == this->m_end)
  1114. {
  1115. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1116. {
  1117. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1118. return false;
  1119. }
  1120. // Treat the opening '{' as a literal character, rewind to start of error:
  1121. --m_position;
  1122. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1123. return parse_literal();
  1124. }
  1125. // get min:
  1126. v = this->m_traits.toi(m_position, m_end, 10);
  1127. // skip whitespace:
  1128. if((v < 0) || (v > umax()))
  1129. {
  1130. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1131. {
  1132. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1133. return false;
  1134. }
  1135. // Treat the opening '{' as a literal character, rewind to start of error:
  1136. --m_position;
  1137. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1138. return parse_literal();
  1139. }
  1140. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1141. ++m_position;
  1142. if(this->m_position == this->m_end)
  1143. {
  1144. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1145. {
  1146. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1147. return false;
  1148. }
  1149. // Treat the opening '{' as a literal character, rewind to start of error:
  1150. --m_position;
  1151. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1152. return parse_literal();
  1153. }
  1154. min = static_cast<std::size_t>(v);
  1155. // see if we have a comma:
  1156. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_comma)
  1157. {
  1158. // move on and error check:
  1159. ++m_position;
  1160. // skip whitespace:
  1161. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1162. ++m_position;
  1163. if(this->m_position == this->m_end)
  1164. {
  1165. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1166. {
  1167. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1168. return false;
  1169. }
  1170. // Treat the opening '{' as a literal character, rewind to start of error:
  1171. --m_position;
  1172. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1173. return parse_literal();
  1174. }
  1175. // get the value if any:
  1176. v = this->m_traits.toi(m_position, m_end, 10);
  1177. max = ((v >= 0) && (v < umax())) ? (std::size_t)v : (std::numeric_limits<std::size_t>::max)();
  1178. }
  1179. else
  1180. {
  1181. // no comma, max = min:
  1182. max = min;
  1183. }
  1184. // skip whitespace:
  1185. while((m_position != m_end) && this->m_traits.isctype(*m_position, this->m_mask_space))
  1186. ++m_position;
  1187. // OK now check trailing }:
  1188. if(this->m_position == this->m_end)
  1189. {
  1190. if(this->flags() & (regbase::main_option_type | regbase::no_perl_ex))
  1191. {
  1192. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1193. return false;
  1194. }
  1195. // Treat the opening '{' as a literal character, rewind to start of error:
  1196. --m_position;
  1197. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1198. return parse_literal();
  1199. }
  1200. if(isbasic)
  1201. {
  1202. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_escape)
  1203. {
  1204. ++m_position;
  1205. if(this->m_position == this->m_end)
  1206. {
  1207. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1208. return false;
  1209. }
  1210. }
  1211. else
  1212. {
  1213. fail(regex_constants::error_brace, this->m_position - this->m_base, incomplete_message);
  1214. return false;
  1215. }
  1216. }
  1217. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_brace)
  1218. ++m_position;
  1219. else
  1220. {
  1221. // Treat the opening '{' as a literal character, rewind to start of error:
  1222. --m_position;
  1223. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_brace) --m_position;
  1224. return parse_literal();
  1225. }
  1226. //
  1227. // finally go and add the repeat, unless error:
  1228. //
  1229. if(min > max)
  1230. {
  1231. // Backtrack to error location:
  1232. m_position -= 2;
  1233. while(this->m_traits.isctype(*m_position, this->m_word_mask)) --m_position;
  1234. ++m_position;
  1235. fail(regex_constants::error_badbrace, m_position - m_base);
  1236. return false;
  1237. }
  1238. return parse_repeat(min, max);
  1239. }
  1240. template <class charT, class traits>
  1241. bool basic_regex_parser<charT, traits>::parse_alt()
  1242. {
  1243. //
  1244. // error check: if there have been no previous states,
  1245. // or if the last state was a '(' then error:
  1246. //
  1247. if(
  1248. ((this->m_last_state == 0) || (this->m_last_state->type == syntax_element_startmark))
  1249. &&
  1250. !(
  1251. ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
  1252. &&
  1253. ((this->flags() & regbase::no_empty_expressions) == 0)
  1254. )
  1255. )
  1256. {
  1257. fail(regex_constants::error_empty, this->m_position - this->m_base, "A regular expression cannot start with the alternation operator |.");
  1258. return false;
  1259. }
  1260. //
  1261. // Reset mark count if required:
  1262. //
  1263. if(m_max_mark < m_mark_count)
  1264. m_max_mark = m_mark_count;
  1265. if(m_mark_reset >= 0)
  1266. m_mark_count = m_mark_reset;
  1267. ++m_position;
  1268. //
  1269. // we need to append a trailing jump:
  1270. //
  1271. re_syntax_base* pj = this->append_state(BOOST_REGEX_DETAIL_NS::syntax_element_jump, sizeof(re_jump));
  1272. std::ptrdiff_t jump_offset = this->getoffset(pj);
  1273. //
  1274. // now insert the alternative:
  1275. //
  1276. re_alt* palt = static_cast<re_alt*>(this->insert_state(this->m_alt_insert_point, syntax_element_alt, re_alt_size));
  1277. jump_offset += re_alt_size;
  1278. this->m_pdata->m_data.align();
  1279. palt->alt.i = this->m_pdata->m_data.size() - this->getoffset(palt);
  1280. //
  1281. // update m_alt_insert_point so that the next alternate gets
  1282. // inserted at the start of the second of the two we've just created:
  1283. //
  1284. this->m_alt_insert_point = this->m_pdata->m_data.size();
  1285. //
  1286. // the start of this alternative must have a case changes state
  1287. // if the current block has messed around with case changes:
  1288. //
  1289. if(m_has_case_change)
  1290. {
  1291. static_cast<re_case*>(
  1292. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  1293. )->icase = this->m_icase;
  1294. }
  1295. //
  1296. // push the alternative onto our stack, a recursive
  1297. // implementation here is easier to understand (and faster
  1298. // as it happens), but causes all kinds of stack overflow problems
  1299. // on programs with small stacks (COM+).
  1300. //
  1301. m_alt_jumps.push_back(jump_offset);
  1302. return true;
  1303. }
  1304. template <class charT, class traits>
  1305. bool basic_regex_parser<charT, traits>::parse_set()
  1306. {
  1307. static const char* incomplete_message = "Character set declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
  1308. ++m_position;
  1309. if(m_position == m_end)
  1310. {
  1311. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1312. return false;
  1313. }
  1314. basic_char_set<charT, traits> char_set;
  1315. const charT* base = m_position; // where the '[' was
  1316. const charT* item_base = m_position; // where the '[' or '^' was
  1317. while(m_position != m_end)
  1318. {
  1319. switch(this->m_traits.syntax_type(*m_position))
  1320. {
  1321. case regex_constants::syntax_caret:
  1322. if(m_position == base)
  1323. {
  1324. char_set.negate();
  1325. ++m_position;
  1326. item_base = m_position;
  1327. }
  1328. else
  1329. parse_set_literal(char_set);
  1330. break;
  1331. case regex_constants::syntax_close_set:
  1332. if(m_position == item_base)
  1333. {
  1334. parse_set_literal(char_set);
  1335. break;
  1336. }
  1337. else
  1338. {
  1339. ++m_position;
  1340. if(0 == this->append_set(char_set))
  1341. {
  1342. fail(regex_constants::error_ctype, m_position - m_base);
  1343. return false;
  1344. }
  1345. }
  1346. return true;
  1347. case regex_constants::syntax_open_set:
  1348. if(parse_inner_set(char_set))
  1349. break;
  1350. return true;
  1351. case regex_constants::syntax_escape:
  1352. {
  1353. //
  1354. // look ahead and see if this is a character class shortcut
  1355. // \d \w \s etc...
  1356. //
  1357. ++m_position;
  1358. if(this->m_traits.escape_syntax_type(*m_position)
  1359. == regex_constants::escape_type_class)
  1360. {
  1361. char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  1362. if(m != 0)
  1363. {
  1364. char_set.add_class(m);
  1365. ++m_position;
  1366. break;
  1367. }
  1368. }
  1369. else if(this->m_traits.escape_syntax_type(*m_position)
  1370. == regex_constants::escape_type_not_class)
  1371. {
  1372. // negated character class:
  1373. char_class_type m = this->m_traits.lookup_classname(m_position, m_position+1);
  1374. if(m != 0)
  1375. {
  1376. char_set.add_negated_class(m);
  1377. ++m_position;
  1378. break;
  1379. }
  1380. }
  1381. // not a character class, just a regular escape:
  1382. --m_position;
  1383. parse_set_literal(char_set);
  1384. break;
  1385. }
  1386. default:
  1387. parse_set_literal(char_set);
  1388. break;
  1389. }
  1390. }
  1391. return m_position != m_end;
  1392. }
  1393. template <class charT, class traits>
  1394. bool basic_regex_parser<charT, traits>::parse_inner_set(basic_char_set<charT, traits>& char_set)
  1395. {
  1396. static const char* incomplete_message = "Character class declaration starting with [ terminated prematurely - either no ] was found or the set had no content.";
  1397. //
  1398. // we have either a character class [:name:]
  1399. // a collating element [.name.]
  1400. // or an equivalence class [=name=]
  1401. //
  1402. if(m_end == ++m_position)
  1403. {
  1404. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1405. return false;
  1406. }
  1407. switch(this->m_traits.syntax_type(*m_position))
  1408. {
  1409. case regex_constants::syntax_dot:
  1410. //
  1411. // a collating element is treated as a literal:
  1412. //
  1413. --m_position;
  1414. parse_set_literal(char_set);
  1415. return true;
  1416. case regex_constants::syntax_colon:
  1417. {
  1418. // check that character classes are actually enabled:
  1419. if((this->flags() & (regbase::main_option_type | regbase::no_char_classes))
  1420. == (regbase::basic_syntax_group | regbase::no_char_classes))
  1421. {
  1422. --m_position;
  1423. parse_set_literal(char_set);
  1424. return true;
  1425. }
  1426. // skip the ':'
  1427. if(m_end == ++m_position)
  1428. {
  1429. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1430. return false;
  1431. }
  1432. const charT* name_first = m_position;
  1433. // skip at least one character, then find the matching ':]'
  1434. if(m_end == ++m_position)
  1435. {
  1436. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1437. return false;
  1438. }
  1439. while((m_position != m_end)
  1440. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_colon))
  1441. ++m_position;
  1442. const charT* name_last = m_position;
  1443. if(m_end == m_position)
  1444. {
  1445. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1446. return false;
  1447. }
  1448. if((m_end == ++m_position)
  1449. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1450. {
  1451. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1452. return false;
  1453. }
  1454. //
  1455. // check for negated class:
  1456. //
  1457. bool negated = false;
  1458. if(this->m_traits.syntax_type(*name_first) == regex_constants::syntax_caret)
  1459. {
  1460. ++name_first;
  1461. negated = true;
  1462. }
  1463. typedef typename traits::char_class_type m_type;
  1464. m_type m = this->m_traits.lookup_classname(name_first, name_last);
  1465. if(m == 0)
  1466. {
  1467. if(char_set.empty() && (name_last - name_first == 1))
  1468. {
  1469. // maybe a special case:
  1470. ++m_position;
  1471. if( (m_position != m_end)
  1472. && (this->m_traits.syntax_type(*m_position)
  1473. == regex_constants::syntax_close_set))
  1474. {
  1475. if(this->m_traits.escape_syntax_type(*name_first)
  1476. == regex_constants::escape_type_left_word)
  1477. {
  1478. ++m_position;
  1479. this->append_state(syntax_element_word_start);
  1480. return false;
  1481. }
  1482. if(this->m_traits.escape_syntax_type(*name_first)
  1483. == regex_constants::escape_type_right_word)
  1484. {
  1485. ++m_position;
  1486. this->append_state(syntax_element_word_end);
  1487. return false;
  1488. }
  1489. }
  1490. }
  1491. fail(regex_constants::error_ctype, name_first - m_base);
  1492. return false;
  1493. }
  1494. if(negated == false)
  1495. char_set.add_class(m);
  1496. else
  1497. char_set.add_negated_class(m);
  1498. ++m_position;
  1499. break;
  1500. }
  1501. case regex_constants::syntax_equal:
  1502. {
  1503. // skip the '='
  1504. if(m_end == ++m_position)
  1505. {
  1506. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1507. return false;
  1508. }
  1509. const charT* name_first = m_position;
  1510. // skip at least one character, then find the matching '=]'
  1511. if(m_end == ++m_position)
  1512. {
  1513. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1514. return false;
  1515. }
  1516. while((m_position != m_end)
  1517. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal))
  1518. ++m_position;
  1519. const charT* name_last = m_position;
  1520. if(m_end == m_position)
  1521. {
  1522. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1523. return false;
  1524. }
  1525. if((m_end == ++m_position)
  1526. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1527. {
  1528. fail(regex_constants::error_brack, m_position - m_base, incomplete_message);
  1529. return false;
  1530. }
  1531. string_type m = this->m_traits.lookup_collatename(name_first, name_last);
  1532. if((0 == m.size()) || (m.size() > 2))
  1533. {
  1534. fail(regex_constants::error_collate, name_first - m_base);
  1535. return false;
  1536. }
  1537. digraph<charT> d;
  1538. d.first = m[0];
  1539. if(m.size() > 1)
  1540. d.second = m[1];
  1541. else
  1542. d.second = 0;
  1543. char_set.add_equivalent(d);
  1544. ++m_position;
  1545. break;
  1546. }
  1547. default:
  1548. --m_position;
  1549. parse_set_literal(char_set);
  1550. break;
  1551. }
  1552. return true;
  1553. }
  1554. template <class charT, class traits>
  1555. void basic_regex_parser<charT, traits>::parse_set_literal(basic_char_set<charT, traits>& char_set)
  1556. {
  1557. digraph<charT> start_range(get_next_set_literal(char_set));
  1558. if(m_end == m_position)
  1559. {
  1560. fail(regex_constants::error_brack, m_position - m_base);
  1561. return;
  1562. }
  1563. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
  1564. {
  1565. // we have a range:
  1566. if(m_end == ++m_position)
  1567. {
  1568. fail(regex_constants::error_brack, m_position - m_base);
  1569. return;
  1570. }
  1571. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set)
  1572. {
  1573. digraph<charT> end_range = get_next_set_literal(char_set);
  1574. char_set.add_range(start_range, end_range);
  1575. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_dash)
  1576. {
  1577. if(m_end == ++m_position)
  1578. {
  1579. fail(regex_constants::error_brack, m_position - m_base);
  1580. return;
  1581. }
  1582. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_set)
  1583. {
  1584. // trailing - :
  1585. --m_position;
  1586. return;
  1587. }
  1588. fail(regex_constants::error_range, m_position - m_base);
  1589. return;
  1590. }
  1591. return;
  1592. }
  1593. --m_position;
  1594. }
  1595. char_set.add_single(start_range);
  1596. }
  1597. template <class charT, class traits>
  1598. digraph<charT> basic_regex_parser<charT, traits>::get_next_set_literal(basic_char_set<charT, traits>& char_set)
  1599. {
  1600. digraph<charT> result;
  1601. switch(this->m_traits.syntax_type(*m_position))
  1602. {
  1603. case regex_constants::syntax_dash:
  1604. if(!char_set.empty())
  1605. {
  1606. // see if we are at the end of the set:
  1607. if((++m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1608. {
  1609. fail(regex_constants::error_range, m_position - m_base);
  1610. return result;
  1611. }
  1612. --m_position;
  1613. }
  1614. result.first = *m_position++;
  1615. return result;
  1616. case regex_constants::syntax_escape:
  1617. // check to see if escapes are supported first:
  1618. if(this->flags() & regex_constants::no_escape_in_lists)
  1619. {
  1620. result = *m_position++;
  1621. break;
  1622. }
  1623. ++m_position;
  1624. result = unescape_character();
  1625. break;
  1626. case regex_constants::syntax_open_set:
  1627. {
  1628. if(m_end == ++m_position)
  1629. {
  1630. fail(regex_constants::error_collate, m_position - m_base);
  1631. return result;
  1632. }
  1633. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot)
  1634. {
  1635. --m_position;
  1636. result.first = *m_position;
  1637. ++m_position;
  1638. return result;
  1639. }
  1640. if(m_end == ++m_position)
  1641. {
  1642. fail(regex_constants::error_collate, m_position - m_base);
  1643. return result;
  1644. }
  1645. const charT* name_first = m_position;
  1646. // skip at least one character, then find the matching ':]'
  1647. if(m_end == ++m_position)
  1648. {
  1649. fail(regex_constants::error_collate, name_first - m_base);
  1650. return result;
  1651. }
  1652. while((m_position != m_end)
  1653. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_dot))
  1654. ++m_position;
  1655. const charT* name_last = m_position;
  1656. if(m_end == m_position)
  1657. {
  1658. fail(regex_constants::error_collate, name_first - m_base);
  1659. return result;
  1660. }
  1661. if((m_end == ++m_position)
  1662. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_set))
  1663. {
  1664. fail(regex_constants::error_collate, name_first - m_base);
  1665. return result;
  1666. }
  1667. ++m_position;
  1668. string_type s = this->m_traits.lookup_collatename(name_first, name_last);
  1669. if(s.empty() || (s.size() > 2))
  1670. {
  1671. fail(regex_constants::error_collate, name_first - m_base);
  1672. return result;
  1673. }
  1674. result.first = s[0];
  1675. if(s.size() > 1)
  1676. result.second = s[1];
  1677. else
  1678. result.second = 0;
  1679. return result;
  1680. }
  1681. default:
  1682. result = *m_position++;
  1683. }
  1684. return result;
  1685. }
  1686. //
  1687. // does a value fit in the specified charT type?
  1688. //
  1689. template <class charT>
  1690. bool valid_value(charT, boost::intmax_t v, const mpl::true_&)
  1691. {
  1692. return (v >> (sizeof(charT) * CHAR_BIT)) == 0;
  1693. }
  1694. template <class charT>
  1695. bool valid_value(charT, boost::intmax_t, const mpl::false_&)
  1696. {
  1697. return true; // v will alsways fit in a charT
  1698. }
  1699. template <class charT>
  1700. bool valid_value(charT c, boost::intmax_t v)
  1701. {
  1702. return valid_value(c, v, mpl::bool_<(sizeof(charT) < sizeof(boost::intmax_t))>());
  1703. }
  1704. template <class charT, class traits>
  1705. charT basic_regex_parser<charT, traits>::unescape_character()
  1706. {
  1707. #ifdef BOOST_MSVC
  1708. #pragma warning(push)
  1709. #pragma warning(disable:4127)
  1710. #endif
  1711. charT result(0);
  1712. if(m_position == m_end)
  1713. {
  1714. fail(regex_constants::error_escape, m_position - m_base, "Escape sequence terminated prematurely.");
  1715. return false;
  1716. }
  1717. switch(this->m_traits.escape_syntax_type(*m_position))
  1718. {
  1719. case regex_constants::escape_type_control_a:
  1720. result = charT('\a');
  1721. break;
  1722. case regex_constants::escape_type_e:
  1723. result = charT(27);
  1724. break;
  1725. case regex_constants::escape_type_control_f:
  1726. result = charT('\f');
  1727. break;
  1728. case regex_constants::escape_type_control_n:
  1729. result = charT('\n');
  1730. break;
  1731. case regex_constants::escape_type_control_r:
  1732. result = charT('\r');
  1733. break;
  1734. case regex_constants::escape_type_control_t:
  1735. result = charT('\t');
  1736. break;
  1737. case regex_constants::escape_type_control_v:
  1738. result = charT('\v');
  1739. break;
  1740. case regex_constants::escape_type_word_assert:
  1741. result = charT('\b');
  1742. break;
  1743. case regex_constants::escape_type_ascii_control:
  1744. ++m_position;
  1745. if(m_position == m_end)
  1746. {
  1747. // Rewind to start of escape:
  1748. --m_position;
  1749. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1750. fail(regex_constants::error_escape, m_position - m_base, "ASCII escape sequence terminated prematurely.");
  1751. return result;
  1752. }
  1753. result = static_cast<charT>(*m_position % 32);
  1754. break;
  1755. case regex_constants::escape_type_hex:
  1756. ++m_position;
  1757. if(m_position == m_end)
  1758. {
  1759. // Rewind to start of escape:
  1760. --m_position;
  1761. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1762. fail(regex_constants::error_escape, m_position - m_base, "Hexadecimal escape sequence terminated prematurely.");
  1763. return result;
  1764. }
  1765. // maybe have \x{ddd}
  1766. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  1767. {
  1768. ++m_position;
  1769. if(m_position == m_end)
  1770. {
  1771. // Rewind to start of escape:
  1772. --m_position;
  1773. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1774. fail(regex_constants::error_escape, m_position - m_base, "Missing } in hexadecimal escape sequence.");
  1775. return result;
  1776. }
  1777. boost::intmax_t i = this->m_traits.toi(m_position, m_end, 16);
  1778. if((m_position == m_end)
  1779. || (i < 0)
  1780. || ((std::numeric_limits<charT>::is_specialized) && (i > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
  1781. || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  1782. {
  1783. // Rewind to start of escape:
  1784. --m_position;
  1785. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1786. fail(regex_constants::error_badbrace, m_position - m_base, "Hexadecimal escape sequence was invalid.");
  1787. return result;
  1788. }
  1789. ++m_position;
  1790. result = charT(i);
  1791. }
  1792. else
  1793. {
  1794. std::ptrdiff_t len = (std::min)(static_cast<std::ptrdiff_t>(2), static_cast<std::ptrdiff_t>(m_end - m_position));
  1795. boost::intmax_t i = this->m_traits.toi(m_position, m_position + len, 16);
  1796. if((i < 0)
  1797. || !valid_value(charT(0), i))
  1798. {
  1799. // Rewind to start of escape:
  1800. --m_position;
  1801. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1802. fail(regex_constants::error_escape, m_position - m_base, "Escape sequence did not encode a valid character.");
  1803. return result;
  1804. }
  1805. result = charT(i);
  1806. }
  1807. return result;
  1808. case regex_constants::syntax_digit:
  1809. {
  1810. // an octal escape sequence, the first character must be a zero
  1811. // followed by up to 3 octal digits:
  1812. std::ptrdiff_t len = (std::min)(::boost::BOOST_REGEX_DETAIL_NS::distance(m_position, m_end), static_cast<std::ptrdiff_t>(4));
  1813. const charT* bp = m_position;
  1814. boost::intmax_t val = this->m_traits.toi(bp, bp + 1, 8);
  1815. if(val != 0)
  1816. {
  1817. // Rewind to start of escape:
  1818. --m_position;
  1819. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1820. // Oops not an octal escape after all:
  1821. fail(regex_constants::error_escape, m_position - m_base, "Invalid octal escape sequence.");
  1822. return result;
  1823. }
  1824. val = this->m_traits.toi(m_position, m_position + len, 8);
  1825. if((val < 0) || (val > (boost::intmax_t)(std::numeric_limits<charT>::max)()))
  1826. {
  1827. // Rewind to start of escape:
  1828. --m_position;
  1829. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1830. fail(regex_constants::error_escape, m_position - m_base, "Octal escape sequence is invalid.");
  1831. return result;
  1832. }
  1833. return static_cast<charT>(val);
  1834. }
  1835. case regex_constants::escape_type_named_char:
  1836. {
  1837. ++m_position;
  1838. if(m_position == m_end)
  1839. {
  1840. // Rewind to start of escape:
  1841. --m_position;
  1842. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1843. fail(regex_constants::error_escape, m_position - m_base);
  1844. return false;
  1845. }
  1846. // maybe have \N{name}
  1847. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_open_brace)
  1848. {
  1849. const charT* base = m_position;
  1850. // skip forward until we find enclosing brace:
  1851. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_brace))
  1852. ++m_position;
  1853. if(m_position == m_end)
  1854. {
  1855. // Rewind to start of escape:
  1856. --m_position;
  1857. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1858. fail(regex_constants::error_escape, m_position - m_base);
  1859. return false;
  1860. }
  1861. string_type s = this->m_traits.lookup_collatename(++base, m_position++);
  1862. if(s.empty())
  1863. {
  1864. // Rewind to start of escape:
  1865. --m_position;
  1866. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1867. fail(regex_constants::error_collate, m_position - m_base);
  1868. return false;
  1869. }
  1870. if(s.size() == 1)
  1871. {
  1872. return s[0];
  1873. }
  1874. }
  1875. // fall through is a failure:
  1876. // Rewind to start of escape:
  1877. --m_position;
  1878. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1879. fail(regex_constants::error_escape, m_position - m_base);
  1880. return false;
  1881. }
  1882. default:
  1883. result = *m_position;
  1884. break;
  1885. }
  1886. ++m_position;
  1887. return result;
  1888. #ifdef BOOST_MSVC
  1889. #pragma warning(pop)
  1890. #endif
  1891. }
  1892. template <class charT, class traits>
  1893. bool basic_regex_parser<charT, traits>::parse_backref()
  1894. {
  1895. BOOST_ASSERT(m_position != m_end);
  1896. const charT* pc = m_position;
  1897. boost::intmax_t i = this->m_traits.toi(pc, pc + 1, 10);
  1898. if((i == 0) || (((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group) && (this->flags() & regbase::no_bk_refs)))
  1899. {
  1900. // not a backref at all but an octal escape sequence:
  1901. charT c = unescape_character();
  1902. this->append_literal(c);
  1903. }
  1904. else if((i > 0) && (this->m_backrefs & (1u << (i-1))))
  1905. {
  1906. m_position = pc;
  1907. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_backref, sizeof(re_brace)));
  1908. pb->index = i;
  1909. pb->icase = this->flags() & regbase::icase;
  1910. }
  1911. else
  1912. {
  1913. // Rewind to start of escape:
  1914. --m_position;
  1915. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  1916. fail(regex_constants::error_backref, m_position - m_base);
  1917. return false;
  1918. }
  1919. return true;
  1920. }
  1921. template <class charT, class traits>
  1922. bool basic_regex_parser<charT, traits>::parse_QE()
  1923. {
  1924. #ifdef BOOST_MSVC
  1925. #pragma warning(push)
  1926. #pragma warning(disable:4127)
  1927. #endif
  1928. //
  1929. // parse a \Q...\E sequence:
  1930. //
  1931. ++m_position; // skip the Q
  1932. const charT* start = m_position;
  1933. const charT* end;
  1934. do
  1935. {
  1936. while((m_position != m_end)
  1937. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape))
  1938. ++m_position;
  1939. if(m_position == m_end)
  1940. {
  1941. // a \Q...\E sequence may terminate with the end of the expression:
  1942. end = m_position;
  1943. break;
  1944. }
  1945. if(++m_position == m_end) // skip the escape
  1946. {
  1947. fail(regex_constants::error_escape, m_position - m_base, "Unterminated \\Q...\\E sequence.");
  1948. return false;
  1949. }
  1950. // check to see if it's a \E:
  1951. if(this->m_traits.escape_syntax_type(*m_position) == regex_constants::escape_type_E)
  1952. {
  1953. ++m_position;
  1954. end = m_position - 2;
  1955. break;
  1956. }
  1957. // otherwise go round again:
  1958. }while(true);
  1959. //
  1960. // now add all the character between the two escapes as literals:
  1961. //
  1962. while(start != end)
  1963. {
  1964. this->append_literal(*start);
  1965. ++start;
  1966. }
  1967. return true;
  1968. #ifdef BOOST_MSVC
  1969. #pragma warning(pop)
  1970. #endif
  1971. }
  1972. template <class charT, class traits>
  1973. bool basic_regex_parser<charT, traits>::parse_perl_extension()
  1974. {
  1975. if(++m_position == m_end)
  1976. {
  1977. // Rewind to start of (? sequence:
  1978. --m_position;
  1979. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  1980. fail(regex_constants::error_perl_extension, m_position - m_base);
  1981. return false;
  1982. }
  1983. //
  1984. // treat comments as a special case, as these
  1985. // are the only ones that don't start with a leading
  1986. // startmark state:
  1987. //
  1988. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_hash)
  1989. {
  1990. while((m_position != m_end)
  1991. && (this->m_traits.syntax_type(*m_position++) != regex_constants::syntax_close_mark))
  1992. {}
  1993. return true;
  1994. }
  1995. //
  1996. // backup some state, and prepare the way:
  1997. //
  1998. int markid = 0;
  1999. std::ptrdiff_t jump_offset = 0;
  2000. re_brace* pb = static_cast<re_brace*>(this->append_state(syntax_element_startmark, sizeof(re_brace)));
  2001. pb->icase = this->flags() & regbase::icase;
  2002. std::ptrdiff_t last_paren_start = this->getoffset(pb);
  2003. // back up insertion point for alternations, and set new point:
  2004. std::ptrdiff_t last_alt_point = m_alt_insert_point;
  2005. this->m_pdata->m_data.align();
  2006. m_alt_insert_point = this->m_pdata->m_data.size();
  2007. std::ptrdiff_t expected_alt_point = m_alt_insert_point;
  2008. bool restore_flags = true;
  2009. regex_constants::syntax_option_type old_flags = this->flags();
  2010. bool old_case_change = m_has_case_change;
  2011. m_has_case_change = false;
  2012. charT name_delim;
  2013. int mark_reset = m_mark_reset;
  2014. int max_mark = m_max_mark;
  2015. m_mark_reset = -1;
  2016. m_max_mark = m_mark_count;
  2017. boost::intmax_t v;
  2018. //
  2019. // select the actual extension used:
  2020. //
  2021. switch(this->m_traits.syntax_type(*m_position))
  2022. {
  2023. case regex_constants::syntax_or:
  2024. m_mark_reset = m_mark_count;
  2025. BOOST_FALLTHROUGH;
  2026. case regex_constants::syntax_colon:
  2027. //
  2028. // a non-capturing mark:
  2029. //
  2030. pb->index = markid = 0;
  2031. ++m_position;
  2032. break;
  2033. case regex_constants::syntax_digit:
  2034. {
  2035. //
  2036. // a recursive subexpression:
  2037. //
  2038. v = this->m_traits.toi(m_position, m_end, 10);
  2039. if((v < 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2040. {
  2041. // Rewind to start of (? sequence:
  2042. --m_position;
  2043. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2044. fail(regex_constants::error_perl_extension, m_position - m_base, "The recursive sub-expression refers to an invalid marking group, or is unterminated.");
  2045. return false;
  2046. }
  2047. insert_recursion:
  2048. pb->index = markid = 0;
  2049. re_recurse* pr = static_cast<re_recurse*>(this->append_state(syntax_element_recurse, sizeof(re_recurse)));
  2050. pr->alt.i = v;
  2051. pr->state_id = 0;
  2052. static_cast<re_case*>(
  2053. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  2054. )->icase = this->flags() & regbase::icase;
  2055. break;
  2056. }
  2057. case regex_constants::syntax_plus:
  2058. //
  2059. // A forward-relative recursive subexpression:
  2060. //
  2061. ++m_position;
  2062. v = this->m_traits.toi(m_position, m_end, 10);
  2063. if((v <= 0) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2064. {
  2065. // Rewind to start of (? sequence:
  2066. --m_position;
  2067. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2068. fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
  2069. return false;
  2070. }
  2071. if ((std::numeric_limits<boost::intmax_t>::max)() - m_mark_count < v)
  2072. {
  2073. fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
  2074. return false;
  2075. }
  2076. v += m_mark_count;
  2077. goto insert_recursion;
  2078. case regex_constants::syntax_dash:
  2079. //
  2080. // Possibly a backward-relative recursive subexpression:
  2081. //
  2082. ++m_position;
  2083. v = this->m_traits.toi(m_position, m_end, 10);
  2084. if(v <= 0)
  2085. {
  2086. --m_position;
  2087. // Oops not a relative recursion at all, but a (?-imsx) group:
  2088. goto option_group_jump;
  2089. }
  2090. v = m_mark_count + 1 - v;
  2091. if(v <= 0)
  2092. {
  2093. // Rewind to start of (? sequence:
  2094. --m_position;
  2095. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2096. fail(regex_constants::error_perl_extension, m_position - m_base, "An invalid or unterminated recursive sub-expression.");
  2097. return false;
  2098. }
  2099. goto insert_recursion;
  2100. case regex_constants::syntax_equal:
  2101. pb->index = markid = -1;
  2102. ++m_position;
  2103. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2104. this->m_pdata->m_data.align();
  2105. m_alt_insert_point = this->m_pdata->m_data.size();
  2106. break;
  2107. case regex_constants::syntax_not:
  2108. pb->index = markid = -2;
  2109. ++m_position;
  2110. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2111. this->m_pdata->m_data.align();
  2112. m_alt_insert_point = this->m_pdata->m_data.size();
  2113. break;
  2114. case regex_constants::escape_type_left_word:
  2115. {
  2116. // a lookbehind assertion:
  2117. if(++m_position == m_end)
  2118. {
  2119. // Rewind to start of (? sequence:
  2120. --m_position;
  2121. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2122. fail(regex_constants::error_perl_extension, m_position - m_base);
  2123. return false;
  2124. }
  2125. regex_constants::syntax_type t = this->m_traits.syntax_type(*m_position);
  2126. if(t == regex_constants::syntax_not)
  2127. pb->index = markid = -2;
  2128. else if(t == regex_constants::syntax_equal)
  2129. pb->index = markid = -1;
  2130. else
  2131. {
  2132. // Probably a named capture which also starts (?< :
  2133. name_delim = '>';
  2134. --m_position;
  2135. goto named_capture_jump;
  2136. }
  2137. ++m_position;
  2138. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2139. this->append_state(syntax_element_backstep, sizeof(re_brace));
  2140. this->m_pdata->m_data.align();
  2141. m_alt_insert_point = this->m_pdata->m_data.size();
  2142. break;
  2143. }
  2144. case regex_constants::escape_type_right_word:
  2145. //
  2146. // an independent sub-expression:
  2147. //
  2148. pb->index = markid = -3;
  2149. ++m_position;
  2150. jump_offset = this->getoffset(this->append_state(syntax_element_jump, sizeof(re_jump)));
  2151. this->m_pdata->m_data.align();
  2152. m_alt_insert_point = this->m_pdata->m_data.size();
  2153. break;
  2154. case regex_constants::syntax_open_mark:
  2155. {
  2156. // a conditional expression:
  2157. pb->index = markid = -4;
  2158. if(++m_position == m_end)
  2159. {
  2160. // Rewind to start of (? sequence:
  2161. --m_position;
  2162. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2163. fail(regex_constants::error_perl_extension, m_position - m_base);
  2164. return false;
  2165. }
  2166. v = this->m_traits.toi(m_position, m_end, 10);
  2167. if(m_position == m_end)
  2168. {
  2169. // Rewind to start of (? sequence:
  2170. --m_position;
  2171. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2172. fail(regex_constants::error_perl_extension, m_position - m_base);
  2173. return false;
  2174. }
  2175. if(*m_position == charT('R'))
  2176. {
  2177. if(++m_position == m_end)
  2178. {
  2179. // Rewind to start of (? sequence:
  2180. --m_position;
  2181. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2182. fail(regex_constants::error_perl_extension, m_position - m_base);
  2183. return false;
  2184. }
  2185. if(*m_position == charT('&'))
  2186. {
  2187. const charT* base = ++m_position;
  2188. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2189. ++m_position;
  2190. if(m_position == m_end)
  2191. {
  2192. // Rewind to start of (? sequence:
  2193. --m_position;
  2194. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2195. fail(regex_constants::error_perl_extension, m_position - m_base);
  2196. return false;
  2197. }
  2198. v = -static_cast<int>(hash_value_from_capture_name(base, m_position));
  2199. }
  2200. else
  2201. {
  2202. v = -this->m_traits.toi(m_position, m_end, 10);
  2203. }
  2204. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2205. br->index = v < 0 ? (v - 1) : 0;
  2206. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2207. {
  2208. // Rewind to start of (? sequence:
  2209. --m_position;
  2210. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2211. fail(regex_constants::error_perl_extension, m_position - m_base);
  2212. return false;
  2213. }
  2214. if(++m_position == m_end)
  2215. {
  2216. // Rewind to start of (? sequence:
  2217. --m_position;
  2218. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2219. fail(regex_constants::error_perl_extension, m_position - m_base);
  2220. return false;
  2221. }
  2222. }
  2223. else if((*m_position == charT('\'')) || (*m_position == charT('<')))
  2224. {
  2225. const charT* base = ++m_position;
  2226. while((m_position != m_end) && (*m_position != charT('>')) && (*m_position != charT('\'')))
  2227. ++m_position;
  2228. if(m_position == m_end)
  2229. {
  2230. // Rewind to start of (? sequence:
  2231. --m_position;
  2232. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2233. fail(regex_constants::error_perl_extension, m_position - m_base);
  2234. return false;
  2235. }
  2236. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2237. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2238. br->index = v;
  2239. if(((*m_position != charT('>')) && (*m_position != charT('\''))) || (++m_position == m_end))
  2240. {
  2241. // Rewind to start of (? sequence:
  2242. --m_position;
  2243. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2244. fail(regex_constants::error_perl_extension, m_position - m_base, "Unterminated named capture.");
  2245. return false;
  2246. }
  2247. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2248. {
  2249. // Rewind to start of (? sequence:
  2250. --m_position;
  2251. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2252. fail(regex_constants::error_perl_extension, m_position - m_base);
  2253. return false;
  2254. }
  2255. if(++m_position == m_end)
  2256. {
  2257. // Rewind to start of (? sequence:
  2258. --m_position;
  2259. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2260. fail(regex_constants::error_perl_extension, m_position - m_base);
  2261. return false;
  2262. }
  2263. }
  2264. else if(*m_position == charT('D'))
  2265. {
  2266. const char* def = "DEFINE";
  2267. while(*def && (m_position != m_end) && (*m_position == charT(*def)))
  2268. ++m_position, ++def;
  2269. if((m_position == m_end) || *def)
  2270. {
  2271. // Rewind to start of (? sequence:
  2272. --m_position;
  2273. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2274. fail(regex_constants::error_perl_extension, m_position - m_base);
  2275. return false;
  2276. }
  2277. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2278. br->index = 9999; // special magic value!
  2279. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2280. {
  2281. // Rewind to start of (? sequence:
  2282. --m_position;
  2283. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2284. fail(regex_constants::error_perl_extension, m_position - m_base);
  2285. return false;
  2286. }
  2287. if(++m_position == m_end)
  2288. {
  2289. // Rewind to start of (? sequence:
  2290. --m_position;
  2291. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2292. fail(regex_constants::error_perl_extension, m_position - m_base);
  2293. return false;
  2294. }
  2295. }
  2296. else if(v > 0)
  2297. {
  2298. re_brace* br = static_cast<re_brace*>(this->append_state(syntax_element_assert_backref, sizeof(re_brace)));
  2299. br->index = v;
  2300. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2301. {
  2302. // Rewind to start of (? sequence:
  2303. --m_position;
  2304. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2305. fail(regex_constants::error_perl_extension, m_position - m_base);
  2306. return false;
  2307. }
  2308. if(++m_position == m_end)
  2309. {
  2310. // Rewind to start of (? sequence:
  2311. --m_position;
  2312. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2313. fail(regex_constants::error_perl_extension, m_position - m_base);
  2314. return false;
  2315. }
  2316. }
  2317. else
  2318. {
  2319. // verify that we have a lookahead or lookbehind assert:
  2320. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_question)
  2321. {
  2322. // Rewind to start of (? sequence:
  2323. --m_position;
  2324. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2325. fail(regex_constants::error_perl_extension, m_position - m_base);
  2326. return false;
  2327. }
  2328. if(++m_position == m_end)
  2329. {
  2330. // Rewind to start of (? sequence:
  2331. --m_position;
  2332. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2333. fail(regex_constants::error_perl_extension, m_position - m_base);
  2334. return false;
  2335. }
  2336. if(this->m_traits.syntax_type(*m_position) == regex_constants::escape_type_left_word)
  2337. {
  2338. if(++m_position == m_end)
  2339. {
  2340. // Rewind to start of (? sequence:
  2341. --m_position;
  2342. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2343. fail(regex_constants::error_perl_extension, m_position - m_base);
  2344. return false;
  2345. }
  2346. if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
  2347. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
  2348. {
  2349. // Rewind to start of (? sequence:
  2350. --m_position;
  2351. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2352. fail(regex_constants::error_perl_extension, m_position - m_base);
  2353. return false;
  2354. }
  2355. m_position -= 3;
  2356. }
  2357. else
  2358. {
  2359. if((this->m_traits.syntax_type(*m_position) != regex_constants::syntax_equal)
  2360. && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_not))
  2361. {
  2362. // Rewind to start of (? sequence:
  2363. --m_position;
  2364. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2365. fail(regex_constants::error_perl_extension, m_position - m_base);
  2366. return false;
  2367. }
  2368. m_position -= 2;
  2369. }
  2370. }
  2371. break;
  2372. }
  2373. case regex_constants::syntax_close_mark:
  2374. // Rewind to start of (? sequence:
  2375. --m_position;
  2376. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2377. fail(regex_constants::error_perl_extension, m_position - m_base);
  2378. return false;
  2379. case regex_constants::escape_type_end_buffer:
  2380. {
  2381. name_delim = *m_position;
  2382. named_capture_jump:
  2383. markid = 0;
  2384. if(0 == (this->flags() & regbase::nosubs))
  2385. {
  2386. markid = ++m_mark_count;
  2387. #ifndef BOOST_NO_STD_DISTANCE
  2388. if(this->flags() & regbase::save_subexpression_location)
  2389. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>(std::distance(m_base, m_position) - 2, 0));
  2390. #else
  2391. if(this->flags() & regbase::save_subexpression_location)
  2392. this->m_pdata->m_subs.push_back(std::pair<std::size_t, std::size_t>((m_position - m_base) - 2, 0));
  2393. #endif
  2394. }
  2395. pb->index = markid;
  2396. const charT* base = ++m_position;
  2397. if(m_position == m_end)
  2398. {
  2399. // Rewind to start of (? sequence:
  2400. --m_position;
  2401. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2402. fail(regex_constants::error_perl_extension, m_position - m_base);
  2403. return false;
  2404. }
  2405. while((m_position != m_end) && (*m_position != name_delim))
  2406. ++m_position;
  2407. if(m_position == m_end)
  2408. {
  2409. // Rewind to start of (? sequence:
  2410. --m_position;
  2411. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2412. fail(regex_constants::error_perl_extension, m_position - m_base);
  2413. return false;
  2414. }
  2415. this->m_pdata->set_name(base, m_position, markid);
  2416. ++m_position;
  2417. break;
  2418. }
  2419. default:
  2420. if(*m_position == charT('R'))
  2421. {
  2422. ++m_position;
  2423. v = 0;
  2424. if(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark)
  2425. {
  2426. // Rewind to start of (? sequence:
  2427. --m_position;
  2428. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2429. fail(regex_constants::error_perl_extension, m_position - m_base);
  2430. return false;
  2431. }
  2432. goto insert_recursion;
  2433. }
  2434. if(*m_position == charT('&'))
  2435. {
  2436. ++m_position;
  2437. const charT* base = m_position;
  2438. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2439. ++m_position;
  2440. if(m_position == m_end)
  2441. {
  2442. // Rewind to start of (? sequence:
  2443. --m_position;
  2444. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2445. fail(regex_constants::error_perl_extension, m_position - m_base);
  2446. return false;
  2447. }
  2448. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2449. goto insert_recursion;
  2450. }
  2451. if(*m_position == charT('P'))
  2452. {
  2453. ++m_position;
  2454. if(m_position == m_end)
  2455. {
  2456. // Rewind to start of (? sequence:
  2457. --m_position;
  2458. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2459. fail(regex_constants::error_perl_extension, m_position - m_base);
  2460. return false;
  2461. }
  2462. if(*m_position == charT('>'))
  2463. {
  2464. ++m_position;
  2465. const charT* base = m_position;
  2466. while((m_position != m_end) && (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2467. ++m_position;
  2468. if(m_position == m_end)
  2469. {
  2470. // Rewind to start of (? sequence:
  2471. --m_position;
  2472. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2473. fail(regex_constants::error_perl_extension, m_position - m_base);
  2474. return false;
  2475. }
  2476. v = static_cast<int>(hash_value_from_capture_name(base, m_position));
  2477. goto insert_recursion;
  2478. }
  2479. }
  2480. //
  2481. // lets assume that we have a (?imsx) group and try and parse it:
  2482. //
  2483. option_group_jump:
  2484. regex_constants::syntax_option_type opts = parse_options();
  2485. if(m_position == m_end)
  2486. {
  2487. // Rewind to start of (? sequence:
  2488. --m_position;
  2489. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2490. fail(regex_constants::error_perl_extension, m_position - m_base);
  2491. return false;
  2492. }
  2493. // make a note of whether we have a case change:
  2494. m_has_case_change = ((opts & regbase::icase) != (this->flags() & regbase::icase));
  2495. pb->index = markid = 0;
  2496. if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark)
  2497. {
  2498. // update flags and carry on as normal:
  2499. this->flags(opts);
  2500. restore_flags = false;
  2501. old_case_change |= m_has_case_change; // defer end of scope by one ')'
  2502. }
  2503. else if(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_colon)
  2504. {
  2505. // update flags and carry on until the matching ')' is found:
  2506. this->flags(opts);
  2507. ++m_position;
  2508. }
  2509. else
  2510. {
  2511. // Rewind to start of (? sequence:
  2512. --m_position;
  2513. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2514. fail(regex_constants::error_perl_extension, m_position - m_base);
  2515. return false;
  2516. }
  2517. // finally append a case change state if we need it:
  2518. if(m_has_case_change)
  2519. {
  2520. static_cast<re_case*>(
  2521. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  2522. )->icase = opts & regbase::icase;
  2523. }
  2524. }
  2525. //
  2526. // now recursively add more states, this will terminate when we get to a
  2527. // matching ')' :
  2528. //
  2529. parse_all();
  2530. //
  2531. // Unwind alternatives:
  2532. //
  2533. if(0 == unwind_alts(last_paren_start))
  2534. {
  2535. // Rewind to start of (? sequence:
  2536. --m_position;
  2537. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2538. fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid alternation operators within (?...) block.");
  2539. return false;
  2540. }
  2541. //
  2542. // we either have a ')' or we have run out of characters prematurely:
  2543. //
  2544. if(m_position == m_end)
  2545. {
  2546. // Rewind to start of (? sequence:
  2547. --m_position;
  2548. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2549. this->fail(regex_constants::error_paren, ::boost::BOOST_REGEX_DETAIL_NS::distance(m_base, m_end));
  2550. return false;
  2551. }
  2552. BOOST_ASSERT(this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark);
  2553. ++m_position;
  2554. //
  2555. // restore the flags:
  2556. //
  2557. if(restore_flags)
  2558. {
  2559. // append a case change state if we need it:
  2560. if(m_has_case_change)
  2561. {
  2562. static_cast<re_case*>(
  2563. this->append_state(syntax_element_toggle_case, sizeof(re_case))
  2564. )->icase = old_flags & regbase::icase;
  2565. }
  2566. this->flags(old_flags);
  2567. }
  2568. //
  2569. // set up the jump pointer if we have one:
  2570. //
  2571. if(jump_offset)
  2572. {
  2573. this->m_pdata->m_data.align();
  2574. re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
  2575. jmp->alt.i = this->m_pdata->m_data.size() - this->getoffset(jmp);
  2576. if((this->m_last_state == jmp) && (markid != -2))
  2577. {
  2578. // Oops... we didn't have anything inside the assertion.
  2579. // Note we don't get here for negated forward lookahead as (?!)
  2580. // does have some uses.
  2581. // Rewind to start of (? sequence:
  2582. --m_position;
  2583. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2584. fail(regex_constants::error_perl_extension, m_position - m_base, "Invalid or empty zero width assertion.");
  2585. return false;
  2586. }
  2587. }
  2588. //
  2589. // verify that if this is conditional expression, that we do have
  2590. // an alternative, if not add one:
  2591. //
  2592. if(markid == -4)
  2593. {
  2594. re_syntax_base* b = this->getaddress(expected_alt_point);
  2595. // Make sure we have exactly one alternative following this state:
  2596. if(b->type != syntax_element_alt)
  2597. {
  2598. re_alt* alt = static_cast<re_alt*>(this->insert_state(expected_alt_point, syntax_element_alt, sizeof(re_alt)));
  2599. alt->alt.i = this->m_pdata->m_data.size() - this->getoffset(alt);
  2600. }
  2601. else if(((std::ptrdiff_t)this->m_pdata->m_data.size() > (static_cast<re_alt*>(b)->alt.i + this->getoffset(b))) && (static_cast<re_alt*>(b)->alt.i > 0) && this->getaddress(static_cast<re_alt*>(b)->alt.i, b)->type == syntax_element_alt)
  2602. {
  2603. // Can't have seen more than one alternative:
  2604. // Rewind to start of (? sequence:
  2605. --m_position;
  2606. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2607. fail(regex_constants::error_bad_pattern, m_position - m_base, "More than one alternation operator | was encountered inside a conditional expression.");
  2608. return false;
  2609. }
  2610. else
  2611. {
  2612. // We must *not* have seen an alternative inside a (DEFINE) block:
  2613. b = this->getaddress(b->next.i, b);
  2614. if((b->type == syntax_element_assert_backref) && (static_cast<re_brace*>(b)->index == 9999))
  2615. {
  2616. // Rewind to start of (? sequence:
  2617. --m_position;
  2618. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2619. fail(regex_constants::error_bad_pattern, m_position - m_base, "Alternation operators are not allowed inside a DEFINE block.");
  2620. return false;
  2621. }
  2622. }
  2623. // check for invalid repetition of next state:
  2624. b = this->getaddress(expected_alt_point);
  2625. b = this->getaddress(static_cast<re_alt*>(b)->next.i, b);
  2626. if((b->type != syntax_element_assert_backref)
  2627. && (b->type != syntax_element_startmark))
  2628. {
  2629. // Rewind to start of (? sequence:
  2630. --m_position;
  2631. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2632. fail(regex_constants::error_badrepeat, m_position - m_base, "A repetition operator cannot be applied to a zero-width assertion.");
  2633. return false;
  2634. }
  2635. }
  2636. //
  2637. // append closing parenthesis state:
  2638. //
  2639. pb = static_cast<re_brace*>(this->append_state(syntax_element_endmark, sizeof(re_brace)));
  2640. pb->index = markid;
  2641. pb->icase = this->flags() & regbase::icase;
  2642. this->m_paren_start = last_paren_start;
  2643. //
  2644. // restore the alternate insertion point:
  2645. //
  2646. this->m_alt_insert_point = last_alt_point;
  2647. //
  2648. // and the case change data:
  2649. //
  2650. m_has_case_change = old_case_change;
  2651. //
  2652. // And the mark_reset data:
  2653. //
  2654. if(m_max_mark > m_mark_count)
  2655. {
  2656. m_mark_count = m_max_mark;
  2657. }
  2658. m_mark_reset = mark_reset;
  2659. m_max_mark = max_mark;
  2660. if(markid > 0)
  2661. {
  2662. #ifndef BOOST_NO_STD_DISTANCE
  2663. if(this->flags() & regbase::save_subexpression_location)
  2664. this->m_pdata->m_subs.at(markid - 1).second = std::distance(m_base, m_position) - 1;
  2665. #else
  2666. if(this->flags() & regbase::save_subexpression_location)
  2667. this->m_pdata->m_subs.at(markid - 1).second = (m_position - m_base) - 1;
  2668. #endif
  2669. //
  2670. // allow backrefs to this mark:
  2671. //
  2672. if(markid < (int)(sizeof(unsigned) * CHAR_BIT))
  2673. this->m_backrefs |= 1u << (markid - 1);
  2674. }
  2675. return true;
  2676. }
  2677. template <class charT, class traits>
  2678. bool basic_regex_parser<charT, traits>::match_verb(const char* verb)
  2679. {
  2680. while(*verb)
  2681. {
  2682. if(static_cast<charT>(*verb) != *m_position)
  2683. {
  2684. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2685. fail(regex_constants::error_perl_extension, m_position - m_base);
  2686. return false;
  2687. }
  2688. if(++m_position == m_end)
  2689. {
  2690. --m_position;
  2691. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2692. fail(regex_constants::error_perl_extension, m_position - m_base);
  2693. return false;
  2694. }
  2695. ++verb;
  2696. }
  2697. return true;
  2698. }
  2699. template <class charT, class traits>
  2700. bool basic_regex_parser<charT, traits>::parse_perl_verb()
  2701. {
  2702. if(++m_position == m_end)
  2703. {
  2704. // Rewind to start of (* sequence:
  2705. --m_position;
  2706. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2707. fail(regex_constants::error_perl_extension, m_position - m_base);
  2708. return false;
  2709. }
  2710. switch(*m_position)
  2711. {
  2712. case 'F':
  2713. if(++m_position == m_end)
  2714. {
  2715. // Rewind to start of (* sequence:
  2716. --m_position;
  2717. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2718. fail(regex_constants::error_perl_extension, m_position - m_base);
  2719. return false;
  2720. }
  2721. if((this->m_traits.syntax_type(*m_position) == regex_constants::syntax_close_mark) || match_verb("AIL"))
  2722. {
  2723. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2724. {
  2725. // Rewind to start of (* sequence:
  2726. --m_position;
  2727. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2728. fail(regex_constants::error_perl_extension, m_position - m_base);
  2729. return false;
  2730. }
  2731. ++m_position;
  2732. this->append_state(syntax_element_fail);
  2733. return true;
  2734. }
  2735. break;
  2736. case 'A':
  2737. if(++m_position == m_end)
  2738. {
  2739. // Rewind to start of (* sequence:
  2740. --m_position;
  2741. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2742. fail(regex_constants::error_perl_extension, m_position - m_base);
  2743. return false;
  2744. }
  2745. if(match_verb("CCEPT"))
  2746. {
  2747. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2748. {
  2749. // Rewind to start of (* sequence:
  2750. --m_position;
  2751. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2752. fail(regex_constants::error_perl_extension, m_position - m_base);
  2753. return false;
  2754. }
  2755. ++m_position;
  2756. this->append_state(syntax_element_accept);
  2757. return true;
  2758. }
  2759. break;
  2760. case 'C':
  2761. if(++m_position == m_end)
  2762. {
  2763. // Rewind to start of (* sequence:
  2764. --m_position;
  2765. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2766. fail(regex_constants::error_perl_extension, m_position - m_base);
  2767. return false;
  2768. }
  2769. if(match_verb("OMMIT"))
  2770. {
  2771. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2772. {
  2773. // Rewind to start of (* sequence:
  2774. --m_position;
  2775. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2776. fail(regex_constants::error_perl_extension, m_position - m_base);
  2777. return false;
  2778. }
  2779. ++m_position;
  2780. static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_commit;
  2781. this->m_pdata->m_disable_match_any = true;
  2782. return true;
  2783. }
  2784. break;
  2785. case 'P':
  2786. if(++m_position == m_end)
  2787. {
  2788. // Rewind to start of (* sequence:
  2789. --m_position;
  2790. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2791. fail(regex_constants::error_perl_extension, m_position - m_base);
  2792. return false;
  2793. }
  2794. if(match_verb("RUNE"))
  2795. {
  2796. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2797. {
  2798. // Rewind to start of (* sequence:
  2799. --m_position;
  2800. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2801. fail(regex_constants::error_perl_extension, m_position - m_base);
  2802. return false;
  2803. }
  2804. ++m_position;
  2805. static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_prune;
  2806. this->m_pdata->m_disable_match_any = true;
  2807. return true;
  2808. }
  2809. break;
  2810. case 'S':
  2811. if(++m_position == m_end)
  2812. {
  2813. // Rewind to start of (* sequence:
  2814. --m_position;
  2815. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2816. fail(regex_constants::error_perl_extension, m_position - m_base);
  2817. return false;
  2818. }
  2819. if(match_verb("KIP"))
  2820. {
  2821. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2822. {
  2823. // Rewind to start of (* sequence:
  2824. --m_position;
  2825. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2826. fail(regex_constants::error_perl_extension, m_position - m_base);
  2827. return false;
  2828. }
  2829. ++m_position;
  2830. static_cast<re_commit*>(this->append_state(syntax_element_commit, sizeof(re_commit)))->action = commit_skip;
  2831. this->m_pdata->m_disable_match_any = true;
  2832. return true;
  2833. }
  2834. break;
  2835. case 'T':
  2836. if(++m_position == m_end)
  2837. {
  2838. // Rewind to start of (* sequence:
  2839. --m_position;
  2840. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2841. fail(regex_constants::error_perl_extension, m_position - m_base);
  2842. return false;
  2843. }
  2844. if(match_verb("HEN"))
  2845. {
  2846. if((m_position == m_end) || (this->m_traits.syntax_type(*m_position) != regex_constants::syntax_close_mark))
  2847. {
  2848. // Rewind to start of (* sequence:
  2849. --m_position;
  2850. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2851. fail(regex_constants::error_perl_extension, m_position - m_base);
  2852. return false;
  2853. }
  2854. ++m_position;
  2855. this->append_state(syntax_element_then);
  2856. this->m_pdata->m_disable_match_any = true;
  2857. return true;
  2858. }
  2859. break;
  2860. }
  2861. // Rewind to start of (* sequence:
  2862. --m_position;
  2863. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2864. fail(regex_constants::error_perl_extension, m_position - m_base);
  2865. return false;
  2866. }
  2867. template <class charT, class traits>
  2868. bool basic_regex_parser<charT, traits>::add_emacs_code(bool negate)
  2869. {
  2870. //
  2871. // parses an emacs style \sx or \Sx construct.
  2872. //
  2873. if(++m_position == m_end)
  2874. {
  2875. // Rewind to start of sequence:
  2876. --m_position;
  2877. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_escape) --m_position;
  2878. fail(regex_constants::error_escape, m_position - m_base);
  2879. return false;
  2880. }
  2881. basic_char_set<charT, traits> char_set;
  2882. if(negate)
  2883. char_set.negate();
  2884. static const charT s_punct[5] = { 'p', 'u', 'n', 'c', 't', };
  2885. switch(*m_position)
  2886. {
  2887. case 's':
  2888. case ' ':
  2889. char_set.add_class(this->m_mask_space);
  2890. break;
  2891. case 'w':
  2892. char_set.add_class(this->m_word_mask);
  2893. break;
  2894. case '_':
  2895. char_set.add_single(digraph<charT>(charT('$')));
  2896. char_set.add_single(digraph<charT>(charT('&')));
  2897. char_set.add_single(digraph<charT>(charT('*')));
  2898. char_set.add_single(digraph<charT>(charT('+')));
  2899. char_set.add_single(digraph<charT>(charT('-')));
  2900. char_set.add_single(digraph<charT>(charT('_')));
  2901. char_set.add_single(digraph<charT>(charT('<')));
  2902. char_set.add_single(digraph<charT>(charT('>')));
  2903. break;
  2904. case '.':
  2905. char_set.add_class(this->m_traits.lookup_classname(s_punct, s_punct+5));
  2906. break;
  2907. case '(':
  2908. char_set.add_single(digraph<charT>(charT('(')));
  2909. char_set.add_single(digraph<charT>(charT('[')));
  2910. char_set.add_single(digraph<charT>(charT('{')));
  2911. break;
  2912. case ')':
  2913. char_set.add_single(digraph<charT>(charT(')')));
  2914. char_set.add_single(digraph<charT>(charT(']')));
  2915. char_set.add_single(digraph<charT>(charT('}')));
  2916. break;
  2917. case '"':
  2918. char_set.add_single(digraph<charT>(charT('"')));
  2919. char_set.add_single(digraph<charT>(charT('\'')));
  2920. char_set.add_single(digraph<charT>(charT('`')));
  2921. break;
  2922. case '\'':
  2923. char_set.add_single(digraph<charT>(charT('\'')));
  2924. char_set.add_single(digraph<charT>(charT(',')));
  2925. char_set.add_single(digraph<charT>(charT('#')));
  2926. break;
  2927. case '<':
  2928. char_set.add_single(digraph<charT>(charT(';')));
  2929. break;
  2930. case '>':
  2931. char_set.add_single(digraph<charT>(charT('\n')));
  2932. char_set.add_single(digraph<charT>(charT('\f')));
  2933. break;
  2934. default:
  2935. fail(regex_constants::error_ctype, m_position - m_base);
  2936. return false;
  2937. }
  2938. if(0 == this->append_set(char_set))
  2939. {
  2940. fail(regex_constants::error_ctype, m_position - m_base);
  2941. return false;
  2942. }
  2943. ++m_position;
  2944. return true;
  2945. }
  2946. template <class charT, class traits>
  2947. regex_constants::syntax_option_type basic_regex_parser<charT, traits>::parse_options()
  2948. {
  2949. // we have a (?imsx-imsx) group, convert it into a set of flags:
  2950. regex_constants::syntax_option_type f = this->flags();
  2951. bool breakout = false;
  2952. do
  2953. {
  2954. switch(*m_position)
  2955. {
  2956. case 's':
  2957. f |= regex_constants::mod_s;
  2958. f &= ~regex_constants::no_mod_s;
  2959. break;
  2960. case 'm':
  2961. f &= ~regex_constants::no_mod_m;
  2962. break;
  2963. case 'i':
  2964. f |= regex_constants::icase;
  2965. break;
  2966. case 'x':
  2967. f |= regex_constants::mod_x;
  2968. break;
  2969. default:
  2970. breakout = true;
  2971. continue;
  2972. }
  2973. if(++m_position == m_end)
  2974. {
  2975. // Rewind to start of (? sequence:
  2976. --m_position;
  2977. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2978. fail(regex_constants::error_paren, m_position - m_base);
  2979. return false;
  2980. }
  2981. }
  2982. while(!breakout);
  2983. breakout = false;
  2984. if(*m_position == static_cast<charT>('-'))
  2985. {
  2986. if(++m_position == m_end)
  2987. {
  2988. // Rewind to start of (? sequence:
  2989. --m_position;
  2990. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  2991. fail(regex_constants::error_paren, m_position - m_base);
  2992. return false;
  2993. }
  2994. do
  2995. {
  2996. switch(*m_position)
  2997. {
  2998. case 's':
  2999. f &= ~regex_constants::mod_s;
  3000. f |= regex_constants::no_mod_s;
  3001. break;
  3002. case 'm':
  3003. f |= regex_constants::no_mod_m;
  3004. break;
  3005. case 'i':
  3006. f &= ~regex_constants::icase;
  3007. break;
  3008. case 'x':
  3009. f &= ~regex_constants::mod_x;
  3010. break;
  3011. default:
  3012. breakout = true;
  3013. continue;
  3014. }
  3015. if(++m_position == m_end)
  3016. {
  3017. // Rewind to start of (? sequence:
  3018. --m_position;
  3019. while(this->m_traits.syntax_type(*m_position) != regex_constants::syntax_open_mark) --m_position;
  3020. fail(regex_constants::error_paren, m_position - m_base);
  3021. return false;
  3022. }
  3023. }
  3024. while(!breakout);
  3025. }
  3026. return f;
  3027. }
  3028. template <class charT, class traits>
  3029. bool basic_regex_parser<charT, traits>::unwind_alts(std::ptrdiff_t last_paren_start)
  3030. {
  3031. //
  3032. // If we didn't actually add any states after the last
  3033. // alternative then that's an error:
  3034. //
  3035. if((this->m_alt_insert_point == static_cast<std::ptrdiff_t>(this->m_pdata->m_data.size()))
  3036. && m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start)
  3037. &&
  3038. !(
  3039. ((this->flags() & regbase::main_option_type) == regbase::perl_syntax_group)
  3040. &&
  3041. ((this->flags() & regbase::no_empty_expressions) == 0)
  3042. )
  3043. )
  3044. {
  3045. fail(regex_constants::error_empty, this->m_position - this->m_base, "Can't terminate a sub-expression with an alternation operator |.");
  3046. return false;
  3047. }
  3048. //
  3049. // Fix up our alternatives:
  3050. //
  3051. while(m_alt_jumps.size() && (m_alt_jumps.back() > last_paren_start))
  3052. {
  3053. //
  3054. // fix up the jump to point to the end of the states
  3055. // that we've just added:
  3056. //
  3057. std::ptrdiff_t jump_offset = m_alt_jumps.back();
  3058. m_alt_jumps.pop_back();
  3059. this->m_pdata->m_data.align();
  3060. re_jump* jmp = static_cast<re_jump*>(this->getaddress(jump_offset));
  3061. BOOST_ASSERT(jmp->type == syntax_element_jump);
  3062. jmp->alt.i = this->m_pdata->m_data.size() - jump_offset;
  3063. }
  3064. return true;
  3065. }
  3066. #ifdef BOOST_MSVC
  3067. #pragma warning(pop)
  3068. #endif
  3069. } // namespace BOOST_REGEX_DETAIL_NS
  3070. } // namespace boost
  3071. #ifdef BOOST_MSVC
  3072. #pragma warning(push)
  3073. #pragma warning(disable: 4103)
  3074. #endif
  3075. #ifdef BOOST_HAS_ABI_HEADERS
  3076. # include BOOST_ABI_SUFFIX
  3077. #endif
  3078. #ifdef BOOST_MSVC
  3079. #pragma warning(pop)
  3080. #endif
  3081. #endif