PDF SDK Documentation

Comprehensive Guide for Developers: Features, Integration, and API Reference

Loading...
Searching...
No Matches
page_text.h
Go to the documentation of this file.
1// Copyright (c) 2009-2025 Avanquest Software. All rights reserved.
2
3#ifndef PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
4#define PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
5
11#include <cstring>
12#include <cwctype>
13
14#include <pdfsdk/cxx/math.h>
15
16#include "helpers.h"
17#include "wrapper_base.h"
18
19namespace PDF {
20
28
29using PageTextSearchFlags = uint32_t;
30
35 size_t begindex;
36 size_t endindex;
37};
38
39class PageTextChar;
40
47class PageText : public detail::RefCountedHandle<PDPageText> {
48public:
53 size_t GetNumChars() const;
54
60 PageTextChar GetChar(size_t charindex) const;
61
71 size_t HitTest(const PDF::PointF& point) const;
72
80 std::vector<PageTextRange> Search(const std::wstring& text, PageTextSearchFlags flags = 0) const;
81
82 PDF_CXX_CORE_WRAPPER_DEFINE_MEMBERS_(PageText, PDPageText)
83};
84
89public:
99 PDF::Quad GetQuad() const;
100
110 std::wstring GetUnicode() const;
111
122 float GetRotate() const;
123
134 PDFontInfo GetFontInfo() const;
135
141 float GetFontSize() const;
142
148 PDColorValue GetFontColor() const;
149
150private:
151 friend class PageText;
152
153 PageTextChar(const PageText& pagetext, size_t charindex)
154 : m_pagetext(pagetext), m_charindex(charindex) {
155 }
156
157private:
158 PageText m_pagetext;
159 size_t m_charindex;
160};
161
162inline size_t PageText::GetNumChars() const {
163 size_t numchars = 0;
164 PDF_CHECK_SUCCESS_X(PDPageTextGetNumChars(m_handle, &numchars));
165 return numchars;
166}
167
168inline PageTextChar PageText::GetChar(size_t charindex) const {
169 return PageTextChar(*this, charindex);
170}
171
172inline size_t PageText::HitTest(const PDF::PointF& point) const {
173 for (size_t i = 0; i < GetNumChars(); ++i) {
174 auto quad = GetChar(i).GetQuad();
175 if (quad.Contains(point))
176 return i;
177 }
178 return kPDPageIndexNull;
179}
180
181inline std::vector<PageTextRange> PageText::Search(const std::wstring& text, PageTextSearchFlags flags) const {
182 // Implements a variation of Knuth-Morris-Pratt algorithm.
183
184 std::vector<PageTextRange> results;
185 std::wstring pattern = text;
186 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](wchar_t ch) { return std::iswspace(ch) ? L'\0' : ch; });
187 if ((flags & kPageTextSearchIgnoreCase) != 0)
188 std::transform(pattern.begin(), pattern.end(), pattern.begin(), [&](wchar_t ch) { return std::towlower(ch); });
189
190 size_t numchars = GetNumChars();
191 if (0 == numchars || numchars < pattern.size())
192 return results;
193
194 size_t patternsize = pattern.size();
195
196 std::vector<int> kmptable(patternsize);
197 kmptable[0] = -1;
198 if (patternsize > 1) {
199 kmptable[1] = 0;
200 if (patternsize > 2) {
201 size_t pos = 2;
202 int cnd = 0;
203 while (pos < patternsize) {
204 if (pattern[pos - 1] == pattern[cnd])
205 kmptable[pos++] = ++cnd;
206 else if (cnd > 0)
207 cnd = kmptable[cnd];
208 else
209 kmptable[pos++] = 0;
210 }
211 }
212 }
213
214 int kmpindex = 0;
215
216 size_t currentindex = 0;
217 size_t matchindex = currentindex;
218
219 while (currentindex < numchars) {
220 std::wstring u = GetChar(currentindex).GetUnicode();
221 if ((flags & kPageTextSearchIgnoreCase) != 0)
222 std::transform(u.begin(), u.end(), u.begin(), [&](wchar_t ch) { return std::towlower(ch); });
223
224 if (std::iswspace(u[0]))
225 u[0] = L'\0';
226
227 size_t checksize = (std::min)(patternsize - kmpindex, u.size());
228
229 if (!std::memcmp(&pattern[kmpindex], u.data(), checksize * sizeof(wchar_t))) {
230 kmpindex += static_cast<int>(checksize);
231 if (kmpindex == static_cast<int>(patternsize)) {
232 ++currentindex;
233
234 size_t resbegindex = matchindex;
235 size_t resendindex = currentindex;
236
237 kmpindex = 0;
238 currentindex = ++matchindex;
239
240 bool match = true;
241 if ((flags & kPageTextSearchWholeWord) != 0) {
242 std::wstring prefix = resbegindex > 0 ? GetChar(resbegindex - 1).GetUnicode() : std::wstring{};
243 std::wstring suffix = resendindex < numchars ? GetChar(resendindex).GetUnicode() : std::wstring{};
244 auto isDelimiter = [](wchar_t ch) {
245 return ch == '\0' || std::iswspace(ch) || std::iswpunct(ch);
246 };
247 if (!(prefix.empty() || isDelimiter(prefix.front())) ||
248 !(suffix.empty() || isDelimiter(suffix.front())))
249 match = false;
250 }
251
252 if (match)
253 results.push_back({resbegindex, resendindex});
254
255 } else {
256 ++currentindex;
257 }
258 } else {
259 if (kmptable[kmpindex] >= 0) {
260 matchindex += kmpindex - kmptable[kmpindex];
261 kmpindex = kmptable[kmpindex];
262 } else {
263 ++matchindex;
264 kmpindex = 0;
265 }
266
267 currentindex = matchindex + kmpindex;
268 }
269 }
270
271 return results;
272}
273
275 PDF::Quad quad;
276 PDF_CHECK_SUCCESS_X(PDPageTextGetCharQuad(m_pagetext.get(), m_charindex, &quad));
277 return quad;
278}
279
280inline std::wstring PageTextChar::GetUnicode() const {
281 return detail::GetWstringProperty(PDPageTextGetCharUnicode, m_pagetext.get(), m_charindex);
282}
283
284inline float PageTextChar::GetRotate() const {
285 float rotate = 0;
286 PDF_CHECK_SUCCESS_X(PDPageTextGetCharRotate(m_pagetext.get(), m_charindex, &rotate));
287 return rotate;
288}
289
291 PDFontInfo fontinfo = {};
292 PDF_CHECK_SUCCESS_X(PDPageTextGetFontInfo(m_pagetext.get(), m_charindex, &fontinfo));
293 return fontinfo;
294}
295
296inline float PageTextChar::GetFontSize() const {
297 float fontsize = 0;
298 PDF_CHECK_SUCCESS_X(PDPageTextGetFontSize(m_pagetext.get(), m_charindex, &fontsize));
299 return fontsize;
300}
301
302inline PDColorValue PageTextChar::GetFontColor() const {
303 PDColorValue fontcolor = 0;
304 PDF_CHECK_SUCCESS_X(PDPageTextGetFontColor(m_pagetext.get(), m_charindex, &fontcolor));
305 return fontcolor;
306}
307
312public:
313 using iterator_category = std::input_iterator_tag;
314 using value_type = PageTextChar;
315 using difference_type = int32_t;
316 using pointer = PageTextChar*;
317 using reference = PageTextChar&;
318
319public:
324
330 PageTextIterator(const PageText& pagetext, size_t charindex = 0);
331
337
343 PageTextIterator& Advance(int offset);
344
350
356
363
369
375
381
387
393 PageTextIterator& operator+=(difference_type offset);
394
400 PageTextIterator operator+(difference_type offset) const;
401
407 PageTextIterator& operator-=(difference_type offset);
408
414 PageTextIterator operator-(difference_type offset) const;
415
421 bool Equals(const PageTextIterator& rhs) const;
422
428 bool operator==(const PageTextIterator& rhs) const;
429
435 bool operator!=(const PageTextIterator& rhs) const;
436
442 bool LessThen(const PageTextIterator& rhs) const;
443
449 bool operator<(const PageTextIterator& rhs) const;
450
456 bool operator<=(const PageTextIterator& rhs) const;
457
463 bool operator>(const PageTextIterator& rhs) const;
464
470 bool operator>=(const PageTextIterator& rhs) const;
471
476 PageTextIterator begin() const;
477
482 PageTextIterator end() const;
483
488 PageText GetPageText() const;
489
494 size_t GetCharIndex() const;
495
500 PageTextChar GetChar() const;
501
502 PageTextChar operator*() const;
503
504private:
505 PageText m_pagetext;
506 size_t m_charindex;
507 size_t m_numchars;
508};
509
510using PageTextReverseIterator = std::reverse_iterator<PageTextIterator>;
511
513 : m_pagetext(nullptr), m_charindex(0), m_numchars(0) {
514}
515
516inline PageTextIterator::PageTextIterator(const PageText& pagetext, size_t charindex)
517 : m_pagetext(pagetext.get()), m_charindex(charindex) {
518 PDF_CHECK(pagetext, kPDErrBadParam, "Invalid PageText object");
519 m_numchars = pagetext.GetNumChars();
520 PDF_CHECK(charindex <= m_numchars, kPDErrOutOfRange, "Char index is out of range");
521}
522
524 : m_pagetext(rhs.m_pagetext), m_charindex(rhs.m_charindex), m_numchars(rhs.m_numchars) {
525}
526
528 if (offset != 0)
529 return *this;
530
531 PDF_CHECK(m_charindex + offset <= m_numchars, kPDErrBadParam, "Iterator is not incrementable");
532 m_charindex += offset;
533 return *this;
534}
535
537 return Advance(1);
538}
539
541 return Advance(-1);
542}
543
545 m_pagetext = rhs.m_pagetext;
546 m_charindex = rhs.m_charindex;
547 m_numchars = rhs.m_numchars;
548 return *this;
549}
550
552 PDF_CHECK(m_charindex < m_numchars, kPDErrBadParam, "Iterator is not incrementable");
553 ++m_charindex;
554 return *this;
555}
556
558 PDF_CHECK(m_charindex < m_numchars, kPDErrBadParam, "Iterator is not incrementable");
559 ++m_charindex;
560 return *this;
561}
562
564 PDF_CHECK(m_charindex > 0, kPDErrBadParam, "Iterator is not decrementable");
565 --m_charindex;
566 return *this;
567}
568
570 PDF_CHECK(m_charindex > 0, kPDErrBadParam, "Iterator is not decrementable");
571 --m_charindex;
572 return *this;
573}
574
575inline PageTextIterator& PageTextIterator::operator+=(difference_type offset) {
576 if (offset == 0)
577 return *this;
578
579 PDF_CHECK(m_charindex + offset <= m_numchars, kPDErrBadParam, "Iterator is not incrementable");
580
581 m_charindex += offset;
582 return *this;
583}
584
585inline PageTextIterator PageTextIterator::operator+(difference_type offset) const {
586 if (offset == 0)
587 return *this;
588
589 PageTextIterator tmp(*this);
590 tmp += offset;
591 return tmp;
592}
593
594inline PageTextIterator& PageTextIterator::operator-=(difference_type offset) {
595 if (offset == 0)
596 return *this;
597
598 PDF_CHECK(m_charindex - offset >= 0, kPDErrBadParam, "Iterator is not decrementable");
599 m_charindex -= offset;
600 return *this;
601}
602
603inline PageTextIterator PageTextIterator::operator-(difference_type offset) const {
604 if (offset == 0)
605 return *this;
606
607 PageTextIterator tmp(*this);
608 tmp -= offset;
609 return tmp;
610}
611
612inline bool PageTextIterator::Equals(const PageTextIterator& rhs) const {
613 return m_pagetext == rhs.m_pagetext && m_charindex == rhs.m_charindex;
614}
615
616inline bool PageTextIterator::operator==(const PageTextIterator& rhs) const {
617 return Equals(rhs);
618}
619
620inline bool PageTextIterator::operator!=(const PageTextIterator& rhs) const {
621 return !Equals(rhs);
622}
623
624inline bool PageTextIterator::LessThen(const PageTextIterator& rhs) const {
625 PDF_CHECK(m_pagetext == rhs.m_pagetext, kPDErrBadParam, "Iterators are not comparable - different pages");
626 return m_charindex < rhs.m_charindex;
627}
628
629inline bool PageTextIterator::operator<(const PageTextIterator& rhs) const {
630 return LessThen(rhs);
631}
632
633inline bool PageTextIterator::operator<=(const PageTextIterator& rhs) const {
634 return LessThen(rhs) || Equals(rhs);
635}
636
637inline bool PageTextIterator::operator>(const PageTextIterator& rhs) const {
638 return !LessThen(rhs) && !Equals(rhs);
639}
640
641inline bool PageTextIterator::operator>=(const PageTextIterator& rhs) const {
642 return !LessThen(rhs);
643}
644
646 return PageTextIterator(m_pagetext, m_charindex);
647}
648
650 return PageTextIterator(m_pagetext, m_numchars);
651}
652
654 return m_pagetext;
655}
656
657inline size_t PageTextIterator::GetCharIndex() const {
658 return m_charindex;
659}
660
662 return m_pagetext.GetChar(m_charindex);
663}
664
665inline PageTextChar PageTextIterator::operator*() const {
666 // const reference lifetime extension
667 return m_pagetext.GetChar(m_charindex);
668}
669
670} // namespace PDF
671
672#endif // PDFSDK_CXX_PDF_PAGE_TEXT_H_INCLUDED_
Represents a character in a PageText.
Definition page_text.h:88
PDFontInfo GetFontInfo() const
Get the font information of a character. This includes the font family, style and format.
Definition page_text.h:290
std::wstring GetUnicode() const
Get the Unicode representation of a character.
Definition page_text.h:280
PDColorValue GetFontColor() const
Get the font color of a character.
Definition page_text.h:302
float GetRotate() const
Get the rotation angle of a character within a text element.
Definition page_text.h:284
float GetFontSize() const
Get the font size of a character.
Definition page_text.h:296
PDF::Quad GetQuad() const
Get the quad (bounding box) of a character.
Definition page_text.h:274
Represents text on a page.
Definition page_text.h:47
std::vector< PageTextRange > Search(const std::wstring &text, PageTextSearchFlags flags=0) const
Search for the specified text throughout all text within a page.
Definition page_text.h:181
PageTextChar GetChar(size_t charindex) const
Get the character at the specified index.
Definition page_text.h:168
size_t HitTest(const PDF::PointF &point) const
Perform a hit test at the specified point and return the index of the character that contains the point.
Definition page_text.h:172
size_t GetNumChars() const
Get the number of characters from a page, text segment or a word.
Definition page_text.h:162
An iterator for iterating over the characters in a PageText object.
Definition page_text.h:311
bool operator>(const PageTextIterator &rhs) const
Greater than operator.
Definition page_text.h:637
PageText GetPageText() const
Get the PageText object.
Definition page_text.h:653
PageTextIterator operator-(difference_type offset) const
Subtraction operator.
Definition page_text.h:603
PageTextIterator & operator=(const PageTextIterator &rhs)
Assignment operator.
Definition page_text.h:544
bool operator<=(const PageTextIterator &rhs) const
Less than or equal to operator.
Definition page_text.h:633
bool operator!=(const PageTextIterator &rhs) const
Inequality operator.
Definition page_text.h:620
PageTextIterator & operator-=(difference_type offset)
Subtraction assignment operator.
Definition page_text.h:594
bool operator==(const PageTextIterator &rhs) const
Equality operator.
Definition page_text.h:616
PageTextIterator & Prev()
Move the iterator to the previous character.
Definition page_text.h:540
PageTextIterator & Next()
Move the iterator to the next character.
Definition page_text.h:536
PageTextIterator()
Default constructor.
Definition page_text.h:512
bool operator<(const PageTextIterator &rhs) const
Less than operator.
Definition page_text.h:629
PageTextIterator & operator--()
Pre-decrement operator.
Definition page_text.h:563
PageTextIterator operator+(difference_type offset) const
Addition operator.
Definition page_text.h:585
bool Equals(const PageTextIterator &rhs) const
Equality comparison.
Definition page_text.h:612
PageTextIterator & Advance(int offset)
Advance the iterator by the specified offset.
Definition page_text.h:527
PageTextIterator end() const
Get the ending iterator.
Definition page_text.h:649
size_t GetCharIndex() const
Get the character index.
Definition page_text.h:657
bool operator>=(const PageTextIterator &rhs) const
Greater than or equal to operator.
Definition page_text.h:641
PageTextChar GetChar() const
Get the character at the current iterator position.
Definition page_text.h:661
PageTextIterator & operator++()
Pre-increment operator.
Definition page_text.h:551
PageTextIterator begin() const
Get the beginning iterator.
Definition page_text.h:645
PageTextIterator & operator+=(difference_type offset)
Addition assignment operator.
Definition page_text.h:575
bool LessThen(const PageTextIterator &rhs) const
Less than comparison.
Definition page_text.h:624
@ kPDErrBadParam
Bad input parameter.
Definition errors.h:20
@ kPDErrOutOfRange
Value out of range.
Definition errors.h:22
PageTextSearchFlagsEnum
Enumeration flags for page text search.
Definition page_text.h:24
@ kPageTextSearchIgnoreCase
Definition page_text.h:25
@ kPageTextSearchWholeWord
Definition page_text.h:26
Represents a range of text on a page.
Definition page_text.h:34
size_t endindex
Definition page_text.h:36
size_t begindex
Definition page_text.h:35
Definition math.h:131
Definition math.h:820
Definition fonts.h:32