index.hpp 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  9. #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  10. #include <boost/locale/config.hpp>
  11. #include <boost/locale/boundary/types.hpp>
  12. #include <boost/locale/boundary/facets.hpp>
  13. #include <boost/locale/boundary/segment.hpp>
  14. #include <boost/locale/boundary/boundary_point.hpp>
  15. #include <boost/iterator/iterator_facade.hpp>
  16. #include <boost/type_traits/is_same.hpp>
  17. #include <boost/shared_ptr.hpp>
  18. #include <boost/cstdint.hpp>
  19. #include <boost/assert.hpp>
  20. #ifdef BOOST_MSVC
  21. # pragma warning(push)
  22. # pragma warning(disable : 4275 4251 4231 4660)
  23. #endif
  24. #include <string>
  25. #include <locale>
  26. #include <vector>
  27. #include <iterator>
  28. #include <algorithm>
  29. #include <stdexcept>
  30. #include <iostream>
  31. namespace boost {
  32. namespace locale {
  33. namespace boundary {
  34. ///
  35. /// \defgroup boundary Boundary Analysis
  36. ///
  37. /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence boundaries
  38. ///
  39. /// @{
  40. ///
  41. /// \cond INTERNAL
  42. namespace details {
  43. template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
  44. struct mapping_traits {
  45. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  46. static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
  47. {
  48. std::basic_string<char_type> str(b,e);
  49. return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
  50. }
  51. };
  // Compile-time test telling whether SomeIteratorType is one of the iterator
  // types this header handles by taking the address of the first element:
  // a (const) pointer to CharType, or a (const) iterator of
  // std::basic_string<CharType> or std::vector<CharType>.
  template<typename CharType,typename SomeIteratorType>
  struct linear_iterator_traits {
  private:
      typedef std::basic_string<CharType> string_type;
      typedef std::vector<CharType> vector_type;
  public:
      static const bool is_linear =
             is_same<SomeIteratorType,CharType*>::value
          || is_same<SomeIteratorType,CharType const*>::value
          || is_same<SomeIteratorType,typename string_type::iterator>::value
          || is_same<SomeIteratorType,typename string_type::const_iterator>::value
          || is_same<SomeIteratorType,typename vector_type::iterator>::value
          || is_same<SomeIteratorType,typename vector_type::const_iterator>::value;
  };
  63. template<typename IteratorType>
  64. struct mapping_traits<IteratorType,std::random_access_iterator_tag> {
  65. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  66. static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
  67. {
  68. index_type result;
  69. //
  70. // Optimize for most common cases
  71. //
  72. // C++0x requires that string is continious in memory and all known
  73. // string implementations
  74. // do this because of c_str() support.
  75. //
  76. if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e)
  77. {
  78. char_type const *begin = &*b;
  79. char_type const *end = begin + (e-b);
  80. index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end);
  81. result.swap(tmp);
  82. }
  83. else {
  84. std::basic_string<char_type> str(b,e);
  85. index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
  86. result.swap(tmp);
  87. }
  88. return result;
  89. }
  90. };
  91. template<typename BaseIterator>
  92. class mapping {
  93. public:
  94. typedef BaseIterator base_iterator;
  95. typedef typename std::iterator_traits<base_iterator>::value_type char_type;
  96. mapping(boundary_type type,
  97. base_iterator begin,
  98. base_iterator end,
  99. std::locale const &loc)
  100. :
  101. index_(new index_type()),
  102. begin_(begin),
  103. end_(end)
  104. {
  105. index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc);
  106. index_->swap(idx);
  107. }
  108. mapping()
  109. {
  110. }
  111. index_type const &index() const
  112. {
  113. return *index_;
  114. }
  115. base_iterator begin() const
  116. {
  117. return begin_;
  118. }
  119. base_iterator end() const
  120. {
  121. return end_;
  122. }
  123. private:
  124. boost::shared_ptr<index_type> index_;
  125. base_iterator begin_,end_;
  126. };
  127. template<typename BaseIterator>
  128. class segment_index_iterator :
  129. public boost::iterator_facade<
  130. segment_index_iterator<BaseIterator>,
  131. segment<BaseIterator>,
  132. boost::bidirectional_traversal_tag,
  133. segment<BaseIterator> const &
  134. >
  135. {
  136. public:
  137. typedef BaseIterator base_iterator;
  138. typedef mapping<base_iterator> mapping_type;
  139. typedef segment<base_iterator> segment_type;
  140. segment_index_iterator() : current_(0,0),map_(0)
  141. {
  142. }
  143. segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) :
  144. map_(map),
  145. mask_(mask),
  146. full_select_(full_select)
  147. {
  148. set(p);
  149. }
  150. segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) :
  151. map_(map),
  152. mask_(mask),
  153. full_select_(full_select)
  154. {
  155. if(is_begin)
  156. set_begin();
  157. else
  158. set_end();
  159. }
  160. segment_type const &dereference() const
  161. {
  162. return value_;
  163. }
  164. bool equal(segment_index_iterator const &other) const
  165. {
  166. return map_ == other.map_ && current_.second == other.current_.second;
  167. }
  168. void increment()
  169. {
  170. std::pair<size_t,size_t> next = current_;
  171. if(full_select_) {
  172. next.first = next.second;
  173. while(next.second < size()) {
  174. next.second++;
  175. if(valid_offset(next.second))
  176. break;
  177. }
  178. if(next.second == size())
  179. next.first = next.second - 1;
  180. }
  181. else {
  182. while(next.second < size()) {
  183. next.first = next.second;
  184. next.second++;
  185. if(valid_offset(next.second))
  186. break;
  187. }
  188. }
  189. update_current(next);
  190. }
  191. void decrement()
  192. {
  193. std::pair<size_t,size_t> next = current_;
  194. if(full_select_) {
  195. while(next.second >1) {
  196. next.second--;
  197. if(valid_offset(next.second))
  198. break;
  199. }
  200. next.first = next.second;
  201. while(next.first >0) {
  202. next.first--;
  203. if(valid_offset(next.first))
  204. break;
  205. }
  206. }
  207. else {
  208. while(next.second >1) {
  209. next.second--;
  210. if(valid_offset(next.second))
  211. break;
  212. }
  213. next.first = next.second - 1;
  214. }
  215. update_current(next);
  216. }
  217. private:
  218. void set_end()
  219. {
  220. current_.first = size() - 1;
  221. current_.second = size();
  222. value_ = segment_type(map_->end(),map_->end(),0);
  223. }
  224. void set_begin()
  225. {
  226. current_.first = current_.second = 0;
  227. value_ = segment_type(map_->begin(),map_->begin(),0);
  228. increment();
  229. }
  230. void set(base_iterator p)
  231. {
  232. size_t dist=std::distance(map_->begin(),p);
  233. index_type::const_iterator b=map_->index().begin(),e=map_->index().end();
  234. index_type::const_iterator
  235. boundary_point=std::upper_bound(b,e,break_info(dist));
  236. while(boundary_point != e && (boundary_point->rule & mask_)==0)
  237. boundary_point++;
  238. current_.first = current_.second = boundary_point - b;
  239. if(full_select_) {
  240. while(current_.first > 0) {
  241. current_.first --;
  242. if(valid_offset(current_.first))
  243. break;
  244. }
  245. }
  246. else {
  247. if(current_.first > 0)
  248. current_.first --;
  249. }
  250. value_.first = map_->begin();
  251. std::advance(value_.first,get_offset(current_.first));
  252. value_.second = value_.first;
  253. std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first));
  254. update_rule();
  255. }
  256. void update_current(std::pair<size_t,size_t> pos)
  257. {
  258. std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
  259. std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
  260. std::advance(value_.first,first_diff);
  261. std::advance(value_.second,second_diff);
  262. current_ = pos;
  263. update_rule();
  264. }
  265. void update_rule()
  266. {
  267. if(current_.second != size()) {
  268. value_.rule(index()[current_.second].rule);
  269. }
  270. }
  271. size_t get_offset(size_t ind) const
  272. {
  273. if(ind == size())
  274. return index().back().offset;
  275. return index()[ind].offset;
  276. }
  277. bool valid_offset(size_t offset) const
  278. {
  279. return offset == 0
  280. || offset == size() // make sure we not acess index[size]
  281. || (index()[offset].rule & mask_)!=0;
  282. }
  283. size_t size() const
  284. {
  285. return index().size();
  286. }
  287. index_type const &index() const
  288. {
  289. return map_->index();
  290. }
  291. segment_type value_;
  292. std::pair<size_t,size_t> current_;
  293. mapping_type const *map_;
  294. rule_type mask_;
  295. bool full_select_;
  296. };
  297. template<typename BaseIterator>
  298. class boundary_point_index_iterator :
  299. public boost::iterator_facade<
  300. boundary_point_index_iterator<BaseIterator>,
  301. boundary_point<BaseIterator>,
  302. boost::bidirectional_traversal_tag,
  303. boundary_point<BaseIterator> const &
  304. >
  305. {
  306. public:
  307. typedef BaseIterator base_iterator;
  308. typedef mapping<base_iterator> mapping_type;
  309. typedef boundary_point<base_iterator> boundary_point_type;
  310. boundary_point_index_iterator() : current_(0),map_(0)
  311. {
  312. }
  313. boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) :
  314. map_(map),
  315. mask_(mask)
  316. {
  317. if(is_begin)
  318. set_begin();
  319. else
  320. set_end();
  321. }
  322. boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) :
  323. map_(map),
  324. mask_(mask)
  325. {
  326. set(p);
  327. }
  328. boundary_point_type const &dereference() const
  329. {
  330. return value_;
  331. }
  332. bool equal(boundary_point_index_iterator const &other) const
  333. {
  334. return map_ == other.map_ && current_ == other.current_;
  335. }
  336. void increment()
  337. {
  338. size_t next = current_;
  339. while(next < size()) {
  340. next++;
  341. if(valid_offset(next))
  342. break;
  343. }
  344. update_current(next);
  345. }
  346. void decrement()
  347. {
  348. size_t next = current_;
  349. while(next>0) {
  350. next--;
  351. if(valid_offset(next))
  352. break;
  353. }
  354. update_current(next);
  355. }
  356. private:
  357. void set_end()
  358. {
  359. current_ = size();
  360. value_ = boundary_point_type(map_->end(),0);
  361. }
  362. void set_begin()
  363. {
  364. current_ = 0;
  365. value_ = boundary_point_type(map_->begin(),0);
  366. }
  367. void set(base_iterator p)
  368. {
  369. size_t dist = std::distance(map_->begin(),p);
  370. index_type::const_iterator b=index().begin();
  371. index_type::const_iterator e=index().end();
  372. index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist));
  373. if(ptr==index().end())
  374. current_=size()-1;
  375. else
  376. current_=ptr - index().begin();
  377. while(!valid_offset(current_))
  378. current_ ++;
  379. std::ptrdiff_t diff = get_offset(current_) - dist;
  380. std::advance(p,diff);
  381. value_.iterator(p);
  382. update_rule();
  383. }
  384. void update_current(size_t pos)
  385. {
  386. std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
  387. base_iterator i=value_.iterator();
  388. std::advance(i,diff);
  389. current_ = pos;
  390. value_.iterator(i);
  391. update_rule();
  392. }
  393. void update_rule()
  394. {
  395. if(current_ != size()) {
  396. value_.rule(index()[current_].rule);
  397. }
  398. }
  399. size_t get_offset(size_t ind) const
  400. {
  401. if(ind == size())
  402. return index().back().offset;
  403. return index()[ind].offset;
  404. }
  405. bool valid_offset(size_t offset) const
  406. {
  407. return offset == 0
  408. || offset + 1 >= size() // last and first are always valid regardless of mark
  409. || (index()[offset].rule & mask_)!=0;
  410. }
  411. size_t size() const
  412. {
  413. return index().size();
  414. }
  415. index_type const &index() const
  416. {
  417. return map_->index();
  418. }
  419. boundary_point_type value_;
  420. size_t current_;
  421. mapping_type const *map_;
  422. rule_type mask_;
  423. };
  424. } // details
  425. /// \endcond
  // Forward declarations: each of the two index classes declares a
  // constructor and an assignment operator taking the other one.
  template<typename BaseIterator> class boundary_point_index;
  template<typename BaseIterator> class segment_index;
  430. ///
  431. /// \brief This class holds an index of segments in the text range and allows to iterate over them
  432. ///
  433. /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  434. /// to the \ref segment objects.
  435. ///
  436. /// It provides two options on way of selecting segments:
  437. ///
  438. /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to
  439. /// various masks %as \ref word_any.
  440. /// \n
  441. /// The default is to select any types of boundaries.
  442. /// \n
  443. /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators
  444. /// would iterate only over the words containing Kana letters and \ref word_any would select all types of
  445. /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text
  446. /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead
  447. /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?".
  448. /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous
  449. /// %boundary point does not fit the selected rule.
  450. /// \n
  451. /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?".
  452. /// \n
  453. /// This text contains three %boundary points separating it to sentences by different rules:
  454. /// - The exclamation mark "!" ends the sentence "Hello!"
  455. /// - The line feed that splits the sentence "How\nare you?" into two parts.
  456. /// - The question mark that ends the second sentence.
  457. /// \n
  458. /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would
  459. /// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required
  460. /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include
  461. /// all the text up to previous valid %boundary point and would return two expected sentences:
  462. /// "Hello!" and "How\nare you?".
  463. ///
  464. /// This class allows to find a segment according to the given iterator in range using \ref find() member
  465. /// function.
  466. ///
  467. /// \note
  468. ///
  469. /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text
  470. /// invalidates existing iterators and they can't be used any more.
  471. /// - segment_index can be created from boundary_point_index or other segment_index that was created with
  472. /// same \ref boundary_type. This is very fast operation %as they shared same index
  473. /// and it does not require its regeneration.
  474. ///
  475. /// \see
  476. ///
  477. /// - \ref boundary_point_index
  478. /// - \ref segment
  479. /// - \ref boundary_point
  480. ///
  481. template<typename BaseIterator>
  482. class segment_index {
  483. public:
  484. ///
  485. /// The type of the iterator used to iterate over the original text
  486. ///
  487. typedef BaseIterator base_iterator;
  488. #ifdef BOOST_LOCALE_DOXYGEN
  489. ///
  490. /// The bidirectional iterator that iterates over \ref value_type objects.
  491. ///
  492. /// - The iterators may be invalidated by use of any non-const member function
  493. /// including but not limited to \ref rule(rule_type) and \ref full_select(bool).
  494. /// - The returned value_type object is valid %as long %as iterator points to it.
  495. /// So this following code is wrong %as t used after p was updated:
  496. /// \code
  497. /// segment_index<some_iterator>::iterator p=index.begin();
  498. /// segment<some_iterator> &t = *p;
  499. /// ++p;
  500. /// cout << t.str() << endl;
  501. /// \endcode
  502. ///
  503. typedef unspecified_iterator_type iterator;
  504. ///
  505. /// \copydoc iterator
  506. ///
  507. typedef unspecified_iterator_type const_iterator;
  508. #else
  509. typedef details::segment_index_iterator<base_iterator> iterator;
  510. typedef details::segment_index_iterator<base_iterator> const_iterator;
  511. #endif
  512. ///
  513. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  514. /// an object that represents selected segment.
  515. ///
  516. typedef segment<base_iterator> value_type;
  517. ///
  518. /// Default constructor.
  519. ///
  520. /// \note
  521. ///
  522. /// When this object is constructed by default it does not include a valid index, thus
  523. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  524. /// behavior
  525. ///
  526. segment_index() : mask_(0xFFFFFFFFu),full_select_(false)
  527. {
  528. }
  529. ///
  530. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  531. /// in range [begin,end) using a rule \a mask for locale \a loc.
  532. ///
  533. segment_index(boundary_type type,
  534. base_iterator begin,
  535. base_iterator end,
  536. rule_type mask,
  537. std::locale const &loc=std::locale())
  538. :
  539. map_(type,begin,end,loc),
  540. mask_(mask),
  541. full_select_(false)
  542. {
  543. }
  544. ///
  545. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  546. /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc.
  547. ///
  548. segment_index(boundary_type type,
  549. base_iterator begin,
  550. base_iterator end,
  551. std::locale const &loc=std::locale())
  552. :
  553. map_(type,begin,end,loc),
  554. mask_(0xFFFFFFFFu),
  555. full_select_(false)
  556. {
  557. }
  558. ///
  559. /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information
  560. /// and used default rule (all possible segments)
  561. ///
  562. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  563. /// range it is much better to create one from another rather then indexing the same
  564. /// range twice.
  565. ///
  566. /// \note \ref rule() flags are not copied
  567. ///
  568. segment_index(boundary_point_index<base_iterator> const &);
  569. ///
  570. /// Copy an index from a \ref boundary_point_index. It copies all indexing information
  571. /// and uses the default rule (all possible segments)
  572. ///
  573. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  574. /// range it is much better to create one from another rather then indexing the same
  575. /// range twice.
  576. ///
  577. /// \note \ref rule() flags are not copied
  578. ///
  579. segment_index const &operator = (boundary_point_index<base_iterator> const &);
  580. ///
  581. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  582. /// in range [begin,end) for locale \a loc.
  583. ///
  584. /// \note \ref rule() and \ref full_select() remain unchanged.
  585. ///
  586. void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
  587. {
  588. map_ = mapping_type(type,begin,end,loc);
  589. }
  590. ///
  591. /// Get the \ref iterator on the beginning of the segments range.
  592. ///
  593. /// Preconditions: the segment_index should have a mapping
  594. ///
  595. /// \note
  596. ///
  597. /// The returned iterator is invalidated by access to any non-const member functions of this object
  598. ///
  599. iterator begin() const
  600. {
  601. return iterator(true,&map_,mask_,full_select_);
  602. }
  603. ///
  604. /// Get the \ref iterator on the ending of the segments range.
  605. ///
  606. /// Preconditions: the segment_index should have a mapping
  607. ///
  608. /// The returned iterator is invalidated by access to any non-const member functions of this object
  609. ///
  610. iterator end() const
  611. {
  612. return iterator(false,&map_,mask_,full_select_);
  613. }
  614. ///
  615. /// Find a first valid segment following a position \a p.
  616. ///
  617. /// If \a p is inside a valid segment this segment is selected:
  618. ///
  619. /// For example: For \ref word %boundary analysis with \ref word_any rule():
  620. ///
  621. /// - "to| be or ", would point to "be",
  622. /// - "t|o be or ", would point to "to",
  623. /// - "to be or| ", would point to end.
  624. ///
  625. ///
  626. /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator
  627. /// to the text in the mapped range.
  628. ///
  629. /// The returned iterator is invalidated by access to any non-const member functions of this object
  630. ///
  631. iterator find(base_iterator p) const
  632. {
  633. return iterator(p,&map_,mask_,full_select_);
  634. }
  635. ///
  636. /// Get the mask of rules that are used
  637. ///
  638. rule_type rule() const
  639. {
  640. return mask_;
  641. }
  642. ///
  643. /// Set the mask of rules that are used
  644. ///
  645. void rule(rule_type v)
  646. {
  647. mask_ = v;
  648. }
  649. ///
  650. /// Get the full_select property value - should segment include in the range
  651. /// values that not belong to specific \ref rule() or not.
  652. ///
  653. /// The default value is false.
  654. ///
  655. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  656. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  657. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  658. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  659. /// following part "are you?"
  660. ///
  661. bool full_select() const
  662. {
  663. return full_select_;
  664. }
  665. ///
  666. /// Set the full_select property value - should segment include in the range
  667. /// values that not belong to specific \ref rule() or not.
  668. ///
  669. /// The default value is false.
  670. ///
  671. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  672. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  673. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  674. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  675. /// following part "are you?"
  676. ///
  677. void full_select(bool v)
  678. {
  679. full_select_ = v;
  680. }
  681. private:
  682. friend class boundary_point_index<base_iterator>;
  683. typedef details::mapping<base_iterator> mapping_type;
  684. mapping_type map_;
  685. rule_type mask_;
  686. bool full_select_;
  687. };
  688. ///
  689. /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating
  690. /// over them.
  691. ///
  692. /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  693. /// to the \ref boundary_point objects.
  694. ///
  695. /// It provides an option that affects selecting %boundary points according to different rules:
  696. /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific
  697. /// types of %boundary points like \ref sentence_term.
  698. ///
  699. /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default
  700. /// rule is used the %boundary points would be:
  701. ///
  702. /// - "|Hello! How\nare you?"
  703. /// - "Hello! |How\nare you?"
  704. /// - "Hello! How\n|are you?"
  705. /// - "Hello! How\nare you?|"
  706. ///
  707. /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be:
  708. ///
  709. /// - "|Hello! How\nare you?"
  710. /// - "Hello! |How\nare you?"
  711. /// - "Hello! How\nare you?|"
  712. ///
  713. /// Such that a %boundary point defined by a line feed character would be ignored.
  714. ///
  715. /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member
  716. /// function.
  717. ///
  718. /// \note
  719. /// - Even an empty text range [x,x) considered to have a one %boundary point x.
  720. /// - \a a and \a b points of the range [a,b) are always considered %boundary points
  721. /// regardless the rules used.
  722. /// - Changing the \ref rule() option or, of course, re-indexing the text
  723. /// invalidates existing iterators and they can't be used any more.
  724. /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with
  725. /// same \ref boundary_type. This is very fast operation %as they shared same index
  726. /// and it does not require its regeneration.
  727. ///
  728. /// \see
  729. ///
  730. /// - \ref segment_index
  731. /// - \ref boundary_point
  732. /// - \ref segment
  733. ///
  734. template<typename BaseIterator>
  735. class boundary_point_index {
  736. public:
  737. ///
  738. /// The type of the iterator used to iterate over the original text
  739. ///
  740. typedef BaseIterator base_iterator;
  741. #ifdef BOOST_LOCALE_DOXYGEN
  742. ///
  743. /// The bidirectional iterator that iterates over \ref value_type objects.
  744. ///
  745. /// - The iterators may be invalidated by use of any non-const member function
  746. /// including but not limited to \ref rule(rule_type) member function.
  747. /// - The returned value_type object is valid %as long %as iterator points to it.
  748. /// So this following code is wrong %as t used after p was updated:
  749. /// \code
  750. /// boundary_point_index<some_iterator>::iterator p=index.begin();
  751. /// boundary_point<some_iterator> &t = *p;
  752. /// ++p;
  753. /// rule_type r = t->rule();
  754. /// \endcode
  755. ///
  756. typedef unspecified_iterator_type iterator;
  757. ///
  758. /// \copydoc iterator
  759. ///
  760. typedef unspecified_iterator_type const_iterator;
  761. #else
  762. typedef details::boundary_point_index_iterator<base_iterator> iterator;
  763. typedef details::boundary_point_index_iterator<base_iterator> const_iterator;
  764. #endif
  765. ///
  766. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  767. /// an object that represents the selected \ref boundary_point "boundary point".
  768. ///
  769. typedef boundary_point<base_iterator> value_type;
  770. ///
  771. /// Default constructor.
  772. ///
  773. /// \note
  774. ///
  775. /// When this object is constructed by default it does not include a valid index, thus
  776. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  777. /// behavior
  778. ///
  779. boundary_point_index() : mask_(0xFFFFFFFFu)
  780. {
  781. }
  782. ///
  783. /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  784. /// in range [begin,end) using a rule \a mask for locale \a loc.
  785. ///
  786. boundary_point_index(boundary_type type,
  787. base_iterator begin,
  788. base_iterator end,
  789. rule_type mask,
  790. std::locale const &loc=std::locale())
  791. :
  792. map_(type,begin,end,loc),
  793. mask_(mask)
  794. {
  795. }
  796. ///
  797. /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  798. /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc.
  799. ///
  800. boundary_point_index(boundary_type type,
  801. base_iterator begin,
  802. base_iterator end,
  803. std::locale const &loc=std::locale())
  804. :
  805. map_(type,begin,end,loc),
  806. mask_(0xFFFFFFFFu)
  807. {
  808. }
  809. ///
  810. /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information
  811. /// and uses the default rule (all possible %boundary points)
  812. ///
  813. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  814. /// range it is much better to create one from another rather then indexing the same
  815. /// range twice.
  816. ///
  817. /// \note \ref rule() flags are not copied
  818. ///
  819. boundary_point_index(segment_index<base_iterator> const &other);
  820. ///
  821. /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information
  822. /// and keeps the current \ref rule() unchanged
  823. ///
  824. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  825. /// range it is much better to create one from another rather then indexing the same
  826. /// range twice.
  827. ///
  828. /// \note \ref rule() flags are not copied
  829. ///
  830. boundary_point_index const &operator=(segment_index<base_iterator> const &other);
  831. ///
  832. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  833. /// in range [begin,end) for locale \a loc.
  834. ///
  835. /// \note \ref rule() remains unchanged.
  836. ///
  837. void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
  838. {
  839. map_ = mapping_type(type,begin,end,loc);
  840. }
  841. ///
  842. /// Get the \ref iterator on the beginning of the %boundary points range.
  843. ///
  844. /// Preconditions: this boundary_point_index should have a mapping
  845. ///
  846. /// \note
  847. ///
  848. /// The returned iterator is invalidated by access to any non-const member functions of this object
  849. ///
  850. iterator begin() const
  851. {
  852. return iterator(true,&map_,mask_);
  853. }
  854. ///
  855. /// Get the \ref iterator on the ending of the %boundary points range.
  856. ///
  857. /// Preconditions: this boundary_point_index should have a mapping
  858. ///
  859. /// \note
  860. ///
  861. /// The returned iterator is invalidated by access to any non-const member functions of this object
  862. ///
  863. iterator end() const
  864. {
  865. return iterator(false,&map_,mask_);
  866. }
  867. ///
  868. /// Find a first valid %boundary point on a position \a p or following it.
  869. ///
  870. /// For example: For \ref word %boundary analysis of the text "to be or"
  871. ///
  872. /// - "|to be", would return %boundary point at "|to be",
  873. /// - "t|o be", would point to "to| be"
  874. ///
  875. /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator
  876. /// to the text in the mapped range.
  877. ///
  878. /// The returned iterator is invalidated by access to any non-const member functions of this object
  879. ///
  880. iterator find(base_iterator p) const
  881. {
  882. return iterator(p,&map_,mask_);
  883. }
  884. ///
  885. /// Get the mask of rules that are used
  886. ///
  887. rule_type rule() const
  888. {
  889. return mask_;
  890. }
  891. ///
  892. /// Set the mask of rules that are used
  893. ///
  894. void rule(rule_type v)
  895. {
  896. mask_ = v;
  897. }
  898. private:
  899. friend class segment_index<base_iterator>;
  900. typedef details::mapping<base_iterator> mapping_type;
  901. mapping_type map_;
  902. rule_type mask_;
  903. };
  904. /// \cond INTERNAL
  905. template<typename BaseIterator>
  906. segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) :
  907. map_(other.map_),
  908. mask_(0xFFFFFFFFu),
  909. full_select_(false)
  910. {
  911. }
  912. template<typename BaseIterator>
  913. boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) :
  914. map_(other.map_),
  915. mask_(0xFFFFFFFFu)
  916. {
  917. }
  918. template<typename BaseIterator>
  919. segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other)
  920. {
  921. map_ = other.map_;
  922. return *this;
  923. }
  924. template<typename BaseIterator>
  925. boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other)
  926. {
  927. map_ = other.map_;
  928. return *this;
  929. }
  930. /// \endcond
  931. typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef
  932. typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef
  933. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  934. typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef
  935. #endif
  936. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  937. typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef
  938. #endif
  939. typedef segment_index<char const *> csegment_index; ///< convenience typedef
  940. typedef segment_index<wchar_t const *> wcsegment_index; ///< convenience typedef
  941. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  942. typedef segment_index<char16_t const *> u16csegment_index; ///< convenience typedef
  943. #endif
  944. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  945. typedef segment_index<char32_t const *> u32csegment_index; ///< convenience typedef
  946. #endif
  947. typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef
  948. typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef
  949. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  950. typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef
  951. #endif
  952. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  953. typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef
  954. #endif
  955. typedef boundary_point_index<char const *> cboundary_point_index; ///< convenience typedef
  956. typedef boundary_point_index<wchar_t const *> wcboundary_point_index; ///< convenience typedef
  957. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  958. typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef
  959. #endif
  960. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  961. typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef
  962. #endif
  963. } // boundary
  964. } // locale
  965. } // boost
  966. ///
  967. /// \example boundary.cpp
  968. /// Example of using segment_index
  969. /// \example wboundary.cpp
  970. /// Example of using segment_index over wide strings
  971. ///
  972. #ifdef BOOST_MSVC
  973. #pragma warning(pop)
  974. #endif
  975. #endif
  976. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4