3#ifndef PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
4#define PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
14#include <pdfsdk/cxx/math.h>
17#include "wrapper_base.h"
29using PageTextSearchFlags = uint32_t;
47class PageText :
public detail::RefCountedHandle<PDPageText> {
80 std::vector<PageTextRange>
Search(
const std::wstring& text, PageTextSearchFlags flags = 0)
const;
82 PDF_CXX_CORE_WRAPPER_DEFINE_MEMBERS_(
PageText, PDPageText)
154 : m_pagetext(pagetext), m_charindex(charindex) {
164 PDF_CHECK_SUCCESS_X(PDPageTextGetNumChars(m_handle, &numchars));
175 if (quad.Contains(point))
178 return kPDPageIndexNull;
181inline std::vector<PageTextRange>
PageText::Search(
const std::wstring& text, PageTextSearchFlags flags)
const {
184 std::vector<PageTextRange> results;
185 std::wstring pattern = text;
186 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](
wchar_t ch) { return std::iswspace(ch) ? L
'\0' : ch; });
188 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](
wchar_t ch) { return std::towlower(ch); });
191 if (0 == numchars || numchars < pattern.size())
194 size_t patternsize = pattern.size();
196 std::vector<int> kmptable(patternsize);
198 if (patternsize > 1) {
200 if (patternsize > 2) {
203 while (pos < patternsize) {
204 if (pattern[pos - 1] == pattern[cnd])
205 kmptable[pos++] = ++cnd;
216 size_t currentindex = 0;
217 size_t matchindex = currentindex;
219 while (currentindex < numchars) {
222 std::transform(u.begin(), u.end(), u.begin(), [&](
wchar_t ch) { return std::towlower(ch); });
224 if (std::iswspace(u[0]))
227 size_t checksize = (std::min)(patternsize - kmpindex, u.size());
229 if (!std::memcmp(&pattern[kmpindex], u.data(), checksize *
sizeof(
wchar_t))) {
230 kmpindex +=
static_cast<int>(checksize);
231 if (kmpindex ==
static_cast<int>(patternsize)) {
234 size_t resbegindex = matchindex;
235 size_t resendindex = currentindex;
238 currentindex = ++matchindex;
242 std::wstring prefix = resbegindex > 0 ?
GetChar(resbegindex - 1).
GetUnicode() : std::wstring{};
243 std::wstring suffix = resendindex < numchars ?
GetChar(resendindex).
GetUnicode() : std::wstring{};
244 auto isDelimiter = [](
wchar_t ch) {
245 return ch ==
'\0' || std::iswspace(ch) || std::iswpunct(ch);
247 if (!(prefix.empty() || isDelimiter(prefix.front())) ||
248 !(suffix.empty() || isDelimiter(suffix.front())))
253 results.push_back({resbegindex, resendindex});
259 if (kmptable[kmpindex] >= 0) {
260 matchindex += kmpindex - kmptable[kmpindex];
261 kmpindex = kmptable[kmpindex];
267 currentindex = matchindex + kmpindex;
276 PDF_CHECK_SUCCESS_X(PDPageTextGetCharQuad(m_pagetext.get(), m_charindex, &quad));
281 return detail::GetWstringProperty(PDPageTextGetCharUnicode, m_pagetext.get(), m_charindex);
286 PDF_CHECK_SUCCESS_X(PDPageTextGetCharRotate(m_pagetext.get(), m_charindex, &rotate));
292 PDF_CHECK_SUCCESS_X(PDPageTextGetFontInfo(m_pagetext.get(), m_charindex, &fontinfo));
298 PDF_CHECK_SUCCESS_X(PDPageTextGetFontSize(m_pagetext.get(), m_charindex, &fontsize));
303 PDColorValue fontcolor = 0;
304 PDF_CHECK_SUCCESS_X(PDPageTextGetFontColor(m_pagetext.get(), m_charindex, &fontcolor));
313 using iterator_category = std::input_iterator_tag;
315 using difference_type = int32_t;
510using PageTextReverseIterator = std::reverse_iterator<PageTextIterator>;
513 : m_pagetext(nullptr), m_charindex(0), m_numchars(0) {
517 : m_pagetext(pagetext.get()), m_charindex(charindex) {
520 PDF_CHECK(charindex <= m_numchars,
kPDErrOutOfRange,
"Char index is out of range");
524 : m_pagetext(rhs.m_pagetext), m_charindex(rhs.m_charindex), m_numchars(rhs.m_numchars) {
531 PDF_CHECK(m_charindex + offset <= m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
532 m_charindex += offset;
545 m_pagetext = rhs.m_pagetext;
546 m_charindex = rhs.m_charindex;
547 m_numchars = rhs.m_numchars;
552 PDF_CHECK(m_charindex < m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
558 PDF_CHECK(m_charindex < m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
564 PDF_CHECK(m_charindex > 0,
kPDErrBadParam,
"Iterator is not decrementable");
570 PDF_CHECK(m_charindex > 0,
kPDErrBadParam,
"Iterator is not decrementable");
579 PDF_CHECK(m_charindex + offset <= m_numchars,
kPDErrBadParam,
"Iterator is not incrementable");
581 m_charindex += offset;
598 PDF_CHECK(m_charindex - offset >= 0,
kPDErrBadParam,
"Iterator is not decrementable");
599 m_charindex -= offset;
613 return m_pagetext == rhs.m_pagetext && m_charindex == rhs.m_charindex;
625 PDF_CHECK(m_pagetext == rhs.m_pagetext,
kPDErrBadParam,
"Iterators are not comparable - different pages");
626 return m_charindex < rhs.m_charindex;
662 return m_pagetext.
GetChar(m_charindex);
665inline PageTextChar PageTextIterator::operator*()
const {
667 return m_pagetext.
GetChar(m_charindex);
Represents a character in a PageText.
Definition page_text.h:88
PDFontInfo GetFontInfo() const
Get the font information of a character. This includes the font family, style and format.
Definition page_text.h:290
std::wstring GetUnicode() const
Get the Unicode representation of a character.
Definition page_text.h:280
PDColorValue GetFontColor() const
Get the font color of a character.
Definition page_text.h:302
float GetRotate() const
Get the rotation angle of a character within a text element.
Definition page_text.h:284
float GetFontSize() const
Get the font size of a character.
Definition page_text.h:296
PDF::Quad GetQuad() const
Get the quad (bounding box) of a character.
Definition page_text.h:274
Represents text on a page.
Definition page_text.h:47
std::vector< PageTextRange > Search(const std::wstring &text, PageTextSearchFlags flags=0) const
Search for the specified text throughout all text within a page.
Definition page_text.h:181
PageTextChar GetChar(size_t charindex) const
Get the character at the specified index.
Definition page_text.h:168
size_t HitTest(const PDF::PointF &point) const
Perform a hit test at the specified point and return the index of the character that contains the point.
Definition page_text.h:172
size_t GetNumChars() const
Get the number of characters in a page, text segment, or word.
Definition page_text.h:162
An iterator for iterating over the characters in a PageText object.
Definition page_text.h:311
bool operator>(const PageTextIterator &rhs) const
Greater than operator.
Definition page_text.h:637
PageText GetPageText() const
Get the PageText object.
Definition page_text.h:653
PageTextIterator operator-(difference_type offset) const
Subtraction operator.
Definition page_text.h:603
PageTextIterator & operator=(const PageTextIterator &rhs)
Assignment operator.
Definition page_text.h:544
bool operator<=(const PageTextIterator &rhs) const
Less than or equal to operator.
Definition page_text.h:633
bool operator!=(const PageTextIterator &rhs) const
Inequality operator.
Definition page_text.h:620
PageTextIterator & operator-=(difference_type offset)
Subtraction assignment operator.
Definition page_text.h:594
bool operator==(const PageTextIterator &rhs) const
Equality operator.
Definition page_text.h:616
PageTextIterator & Prev()
Move the iterator to the previous character.
Definition page_text.h:540
PageTextIterator & Next()
Move the iterator to the next character.
Definition page_text.h:536
PageTextIterator()
Default constructor.
Definition page_text.h:512
bool operator<(const PageTextIterator &rhs) const
Less than operator.
Definition page_text.h:629
PageTextIterator & operator--()
Pre-decrement operator.
Definition page_text.h:563
PageTextIterator operator+(difference_type offset) const
Addition operator.
Definition page_text.h:585
bool Equals(const PageTextIterator &rhs) const
Equality comparison.
Definition page_text.h:612
PageTextIterator & Advance(int offset)
Advance the iterator by the specified offset.
Definition page_text.h:527
PageTextIterator end() const
Get the ending iterator.
Definition page_text.h:649
size_t GetCharIndex() const
Get the character index.
Definition page_text.h:657
bool operator>=(const PageTextIterator &rhs) const
Greater than or equal to operator.
Definition page_text.h:641
PageTextChar GetChar() const
Get the character at the current iterator position.
Definition page_text.h:661
PageTextIterator & operator++()
Pre-increment operator.
Definition page_text.h:551
PageTextIterator begin() const
Get the beginning iterator.
Definition page_text.h:645
PageTextIterator & operator+=(difference_type offset)
Addition assignment operator.
Definition page_text.h:575
bool LessThen(const PageTextIterator &rhs) const
Less than comparison.
Definition page_text.h:624
@ kPDErrBadParam
Bad input parameter.
Definition errors.h:20
@ kPDErrOutOfRange
Value out of range.
Definition errors.h:22
PageTextSearchFlagsEnum
Enumeration flags for page text search.
Definition page_text.h:24
@ kPageTextSearchIgnoreCase
Definition page_text.h:25
@ kPageTextSearchWholeWord
Definition page_text.h:26
Represents a range of text on a page.
Definition page_text.h:34
size_t endindex
Definition page_text.h:36
size_t begindex
Definition page_text.h:35