libpqxx
The C++ client library for PostgreSQL
encodings.hxx
1
9#ifndef PQXX_H_ENCODINGS
10#define PQXX_H_ENCODINGS
11
12#include <iomanip>
13#include <string>
14#include <string_view>
15
16#include "pqxx/internal/concat.hxx"
17#include "pqxx/internal/encoding_group.hxx"
18
19
20namespace pqxx
21{
22PQXX_DECLARE_ENUM_CONVERSION(pqxx::internal::encoding_group);
23} // namespace pqxx
24
25
26namespace pqxx::internal
27{
29PQXX_PURE char const *name_encoding(int encoding_id);
30
32PQXX_LIBEXPORT encoding_group enc_group(int /* libpq encoding ID */);
33
34
36
40PQXX_LIBEXPORT glyph_scanner_func *get_glyph_scanner(encoding_group);
41
42
43// TODO: Get rid of this one. Use compile-time-specialised version instead.
45
51template<char... NEEDLE>
52inline std::size_t find_char(
53 glyph_scanner_func *scanner, std::string_view haystack,
54 std::size_t here = 0u)
55{
56 auto const sz{std::size(haystack)};
57 auto const data{std::data(haystack)};
58 while (here < sz)
59 {
60 auto next{scanner(data, sz, here)};
61 PQXX_ASSUME(next > here);
62 // (For some reason gcc had a problem with a right-fold here. But clang
63 // was fine.)
64 if ((... or (data[here] == NEEDLE)))
65 {
66 // Also check against a multibyte character starting with a bytes which
67 // just happens to match one of the ASCII bytes we're looking for. It'd
68 // be cleaner to check that first, but either works. So, let's apply the
69 // most selective filter first and skip this check in almost all cases.
70 if (next == here + 1)
71 return here;
72 }
73
74 // Nope, no hit. Move on.
75 here = next;
76 }
77 return sz;
78}
79
80
81// TODO: Get rid of this one. Use compile-time-specialised loop instead.
83
86template<typename CALLABLE>
87inline void for_glyphs(
88 encoding_group enc, CALLABLE callback, char const buffer[],
89 std::size_t buffer_len, std::size_t start = 0)
90{
91 auto const scan{get_glyph_scanner(enc)};
92 for (std::size_t here = start, next; here < buffer_len; here = next)
93 {
94 next = scan(buffer, buffer_len, here);
95 PQXX_ASSUME(next > here);
96 callback(buffer + here, buffer + next);
97 }
98}
99
100
101namespace
102{
104constexpr PQXX_PURE unsigned char
105get_byte(char const buffer[], std::size_t offset) noexcept
106{
107 return static_cast<unsigned char>(buffer[offset]);
108}
109
110
111[[noreturn]] PQXX_COLD void throw_for_encoding_error(
112 char const *encoding_name, char const buffer[], std::size_t start,
113 std::size_t count)
114{
115 std::stringstream s;
116 s << "Invalid byte sequence for encoding " << encoding_name << " at byte "
117 << start << ": " << std::hex << std::setw(2) << std::setfill('0');
118 for (std::size_t i{0}; i < count; ++i)
119 {
120 s << "0x" << static_cast<unsigned int>(get_byte(buffer, start + i));
121 if (i + 1 < count)
122 s << " ";
123 }
124 throw pqxx::argument_error{s.str()};
125}
126
127
129constexpr PQXX_PURE bool
130between_inc(unsigned char value, unsigned bottom, unsigned top)
131{
132 return value >= bottom and value <= top;
133}
134} // namespace
135
136
138
142template<encoding_group> struct glyph_scanner
143{
144 // TODO: Convert to use string_view?
146 PQXX_PURE static std::size_t
147 call(char const buffer[], std::size_t buffer_len, std::size_t start);
148};
149
150
151namespace
152{
154
160template<encoding_group ENC, char... NEEDLE>
161PQXX_PURE inline std::size_t
162find_ascii_char(std::string_view haystack, std::size_t here)
163{
164 // We only know how to search for ASCII characters. It's an optimisation
165 // assumption in the code below.
166 static_assert((... and ((NEEDLE & 0x80) == 0)));
167
168 auto const sz{std::size(haystack)};
169 auto const data{std::data(haystack)};
170 while (here < sz)
171 {
172 // Look up the next character boundary. This can be quite costly, so we
173 // desperately want the call inlined.
174 auto next{glyph_scanner<ENC>::call(data, sz, here)};
175 PQXX_ASSUME(next > here);
176
177 // (For some reason gcc had a problem with a right-fold here. But clang
178 // was fine.)
179 //
180 // In all supported encodings, if a character's first byte is in the ASCII
181 // range, that means it's a single-byte character. It follows that when we
182 // find a match, we do not need to check that we're in a single-byte
183 // character:
184 //
185 // If this is an "ASCII-unsafe" encoding, e.g. SJIS, we're only checking
186 // each character's first byte. That first byte can only match NEEDLE if
187 // it's a single-byte character.
188 //
189 // In an "ASCII-safe" encoding, e.g. UTF-8 or the ISO-8859 ones, we check
190 // for a match at each byte in the text, because it's faster than finding
191 // character boundaries first. But in these encodings, a multichar byte
192 // never contains any bytes in the ASCII range at all.
193 if ((... or (data[here] == NEEDLE)))
194 return here;
195
196 // Nope, no hit. Move on.
197 here = next;
198 }
199 return sz;
200}
201} // namespace
202
203
205
209template<encoding_group ENC, char... NEEDLE>
210PQXX_PURE std::size_t
211find_s_ascii_char(std::string_view haystack, std::size_t here)
212{
213 // We only know how to search for ASCII characters. It's an optimisation
214 // assumption in the code below.
215 static_assert((... and ((NEEDLE >> 7) == 0)));
216
217 auto const sz{std::size(haystack)};
218 auto const data{std::data(haystack)};
219
220 // No supported encoding has multibyte characters that start with an
221 // ASCII-range byte.
222 while ((... and (data[here] != NEEDLE)))
223 {
224 auto const next = glyph_scanner<ENC>::call(data, sz, here);
225 PQXX_ASSUME(next > here);
226 here = next;
227 }
228 return here;
229}
230
231
232template<> struct glyph_scanner<encoding_group::MONOBYTE>
233{
234 static PQXX_PURE constexpr std::size_t
235 call(char const /* buffer */[], std::size_t buffer_len, std::size_t start)
236 {
237 // TODO: Don't bother with npos. Let the caller check.
238 if (start >= buffer_len)
239 PQXX_UNLIKELY return std::string::npos;
240 else
241 return start + 1;
242 }
243};
244
245
246// https://en.wikipedia.org/wiki/Big5#Organization
247template<> struct glyph_scanner<encoding_group::BIG5>
248{
249 static PQXX_PURE std::size_t
250 call(char const buffer[], std::size_t buffer_len, std::size_t start)
251 {
252 if (start >= buffer_len)
253 PQXX_UNLIKELY return std::string::npos;
254
255 auto const byte1{get_byte(buffer, start)};
256 if (byte1 < 0x80)
257 return start + 1;
258
259 if (not between_inc(byte1, 0x81, 0xfe) or (start + 2 > buffer_len))
260 PQXX_UNLIKELY
261 throw_for_encoding_error("BIG5", buffer, start, 1);
262
263 auto const byte2{get_byte(buffer, start + 1)};
264 if (
265 not between_inc(byte2, 0x40, 0x7e) and
266 not between_inc(byte2, 0xa1, 0xfe))
267 PQXX_UNLIKELY
268 throw_for_encoding_error("BIG5", buffer, start, 2);
269
270 return start + 2;
271 }
272};
273
274
275/*
276The PostgreSQL documentation claims that the EUC_* encodings are 1-3 bytes
277each, but other documents explain that the EUC sets can contain 1-(2,3,4) bytes
278depending on the specific extension:
279 EUC_CN : 1-2
280 EUC_JP : 1-3
281 EUC_JIS_2004: 1-2
282 EUC_KR : 1-2
283 EUC_TW : 1-4
284*/
285
286// https://en.wikipedia.org/wiki/GB_2312#EUC-CN
287template<> struct glyph_scanner<encoding_group::EUC_CN>
288{
289 static PQXX_PURE std::size_t
290 call(char const buffer[], std::size_t buffer_len, std::size_t start)
291 {
292 if (start >= buffer_len)
293 return std::string::npos;
294
295 auto const byte1{get_byte(buffer, start)};
296 if (byte1 < 0x80)
297 return start + 1;
298
299 if (not between_inc(byte1, 0xa1, 0xf7) or start + 2 > buffer_len)
300 PQXX_UNLIKELY
301 throw_for_encoding_error("EUC_CN", buffer, start, 1);
302
303 auto const byte2{get_byte(buffer, start + 1)};
304 if (not between_inc(byte2, 0xa1, 0xfe))
305 PQXX_UNLIKELY
306 throw_for_encoding_error("EUC_CN", buffer, start, 2);
307
308 return start + 2;
309 }
310};
311
312
313// EUC-JP and EUC-JIS-2004 represent slightly different code points but iterate
314// the same:
315//
316// https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-JP
317// http://x0213.org/codetable/index.en.html
318template<> struct glyph_scanner<encoding_group::EUC_JP>
319{
320 static PQXX_PURE std::size_t
321 call(char const buffer[], std::size_t buffer_len, std::size_t start)
322 {
323 if (start >= buffer_len)
324 return std::string::npos;
325
326 auto const byte1{get_byte(buffer, start)};
327 if (byte1 < 0x80)
328 return start + 1;
329
330 if (start + 2 > buffer_len)
331 PQXX_UNLIKELY
332 throw_for_encoding_error("EUC_JP", buffer, start, 1);
333
334 auto const byte2{get_byte(buffer, start + 1)};
335 if (byte1 == 0x8e)
336 {
337 if (not between_inc(byte2, 0xa1, 0xfe))
338 PQXX_UNLIKELY
339 throw_for_encoding_error("EUC_JP", buffer, start, 2);
340
341 return start + 2;
342 }
343
344 if (between_inc(byte1, 0xa1, 0xfe))
345 {
346 if (not between_inc(byte2, 0xa1, 0xfe))
347 PQXX_UNLIKELY
348 throw_for_encoding_error("EUC_JP", buffer, start, 2);
349
350 return start + 2;
351 }
352
353 if (byte1 == 0x8f and start + 3 <= buffer_len)
354 {
355 auto const byte3{get_byte(buffer, start + 2)};
356 if (
357 not between_inc(byte2, 0xa1, 0xfe) or
358 not between_inc(byte3, 0xa1, 0xfe))
359 PQXX_UNLIKELY
360 throw_for_encoding_error("EUC_JP", buffer, start, 3);
361
362 return start + 3;
363 }
364
365 throw_for_encoding_error("EUC_JP", buffer, start, 1);
366 }
367};
368
369
370// https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-KR
371template<> struct glyph_scanner<encoding_group::EUC_KR>
372{
373 static PQXX_PURE std::size_t
374 call(char const buffer[], std::size_t buffer_len, std::size_t start)
375 {
376 if (start >= buffer_len)
377 PQXX_UNLIKELY return std::string::npos;
378
379 auto const byte1{get_byte(buffer, start)};
380 if (byte1 < 0x80)
381 return start + 1;
382
383 if (not between_inc(byte1, 0xa1, 0xfe) or start + 2 > buffer_len)
384 PQXX_UNLIKELY
385 throw_for_encoding_error("EUC_KR", buffer, start, 1);
386
387 auto const byte2{get_byte(buffer, start + 1)};
388 if (not between_inc(byte2, 0xa1, 0xfe))
389 PQXX_UNLIKELY
390 throw_for_encoding_error("EUC_KR", buffer, start, 1);
391
392 return start + 2;
393 }
394};
395
396
397// https://en.wikipedia.org/wiki/Extended_Unix_Code#EUC-TW
398template<> struct glyph_scanner<encoding_group::EUC_TW>
399{
400 static PQXX_PURE std::size_t
401 call(char const buffer[], std::size_t buffer_len, std::size_t start)
402 {
403 if (start >= buffer_len)
404 PQXX_UNLIKELY
405 return std::string::npos;
406
407 auto const byte1{get_byte(buffer, start)};
408 if (byte1 < 0x80)
409 return start + 1;
410
411 if (start + 2 > buffer_len)
412 PQXX_UNLIKELY
413 throw_for_encoding_error("EUC_KR", buffer, start, 1);
414
415 auto const byte2{get_byte(buffer, start + 1)};
416 if (between_inc(byte1, 0xa1, 0xfe))
417 {
418 if (not between_inc(byte2, 0xa1, 0xfe))
419 PQXX_UNLIKELY
420 throw_for_encoding_error("EUC_KR", buffer, start, 2);
421
422 return start + 2;
423 }
424
425 if (byte1 != 0x8e or start + 4 > buffer_len)
426 PQXX_UNLIKELY
427 throw_for_encoding_error("EUC_KR", buffer, start, 1);
428
429 if (
430 between_inc(byte2, 0xa1, 0xb0) and
431 between_inc(get_byte(buffer, start + 2), 0xa1, 0xfe) and
432 between_inc(get_byte(buffer, start + 3), 0xa1, 0xfe))
433 return start + 4;
434
435 PQXX_UNLIKELY
436 throw_for_encoding_error("EUC_KR", buffer, start, 4);
437 }
438};
439
440
441// https://en.wikipedia.org/wiki/GB_18030#Mapping
442template<> struct glyph_scanner<encoding_group::GB18030>
443{
444 static PQXX_PURE std::size_t
445 call(char const buffer[], std::size_t buffer_len, std::size_t start)
446 {
447 if (start >= buffer_len)
448 PQXX_UNLIKELY return std::string::npos;
449
450 auto const byte1{get_byte(buffer, start)};
451 if (byte1 < 0x80)
452 return start + 1;
453 if (byte1 == 0x80)
454 throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
455
456 if (start + 2 > buffer_len)
457 PQXX_UNLIKELY
458 throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
459
460 auto const byte2{get_byte(buffer, start + 1)};
461 if (between_inc(byte2, 0x40, 0xfe))
462 {
463 if (byte2 == 0x7f)
464 PQXX_UNLIKELY
465 throw_for_encoding_error("GB18030", buffer, start, 2);
466
467 return start + 2;
468 }
469
470 if (start + 4 > buffer_len)
471 PQXX_UNLIKELY
472 throw_for_encoding_error("GB18030", buffer, start, buffer_len - start);
473
474 if (
475 between_inc(byte2, 0x30, 0x39) and
476 between_inc(get_byte(buffer, start + 2), 0x81, 0xfe) and
477 between_inc(get_byte(buffer, start + 3), 0x30, 0x39))
478 return start + 4;
479
480 PQXX_UNLIKELY
481 throw_for_encoding_error("GB18030", buffer, start, 4);
482 }
483};
484
485
486// https://en.wikipedia.org/wiki/GBK_(character_encoding)#Encoding
487template<> struct glyph_scanner<encoding_group::GBK>
488{
489 static PQXX_PURE std::size_t
490 call(char const buffer[], std::size_t buffer_len, std::size_t start)
491 {
492 if (start >= buffer_len)
493 PQXX_UNLIKELY return std::string::npos;
494
495 auto const byte1{get_byte(buffer, start)};
496 if (byte1 < 0x80)
497 return start + 1;
498
499 if (start + 2 > buffer_len)
500 PQXX_UNLIKELY
501 throw_for_encoding_error("GBK", buffer, start, 1);
502
503 auto const byte2{get_byte(buffer, start + 1)};
504 if (
505 (between_inc(byte1, 0xa1, 0xa9) and between_inc(byte2, 0xa1, 0xfe)) or
506 (between_inc(byte1, 0xb0, 0xf7) and between_inc(byte2, 0xa1, 0xfe)) or
507 (between_inc(byte1, 0x81, 0xa0) and between_inc(byte2, 0x40, 0xfe) and
508 byte2 != 0x7f) or
509 (between_inc(byte1, 0xaa, 0xfe) and between_inc(byte2, 0x40, 0xa0) and
510 byte2 != 0x7f) or
511 (between_inc(byte1, 0xa8, 0xa9) and between_inc(byte2, 0x40, 0xa0) and
512 byte2 != 0x7f) or
513 (between_inc(byte1, 0xaa, 0xaf) and between_inc(byte2, 0xa1, 0xfe)) or
514 (between_inc(byte1, 0xf8, 0xfe) and between_inc(byte2, 0xa1, 0xfe)) or
515 (between_inc(byte1, 0xa1, 0xa7) and between_inc(byte2, 0x40, 0xa0) and
516 byte2 != 0x7f))
517 return start + 2;
518
519 PQXX_UNLIKELY
520 throw_for_encoding_error("GBK", buffer, start, 2);
521 }
522};
523
524
525/*
526The PostgreSQL documentation claims that the JOHAB encoding is 1-3 bytes, but
527"CJKV Information Processing" describes it (actually just the Hangul portion)
528as "three five-bit segments" that reside inside 16 bits (2 bytes).
529
530CJKV Information Processing by Ken Lunde, pg. 269:
531
532 https://bit.ly/2BEOu5V
533*/
534template<> struct glyph_scanner<encoding_group::JOHAB>
535{
536 static PQXX_PURE std::size_t
537 call(char const buffer[], std::size_t buffer_len, std::size_t start)
538 {
539 if (start >= buffer_len)
540 PQXX_UNLIKELY return std::string::npos;
541
542 auto const byte1{get_byte(buffer, start)};
543 if (byte1 < 0x80)
544 return start + 1;
545
546 if (start + 2 > buffer_len)
547 PQXX_UNLIKELY
548 throw_for_encoding_error("JOHAB", buffer, start, 1);
549
550 auto const byte2{get_byte(buffer, start)};
551 if (
552 (between_inc(byte1, 0x84, 0xd3) and
553 (between_inc(byte2, 0x41, 0x7e) or between_inc(byte2, 0x81, 0xfe))) or
554 ((between_inc(byte1, 0xd8, 0xde) or between_inc(byte1, 0xe0, 0xf9)) and
555 (between_inc(byte2, 0x31, 0x7e) or between_inc(byte2, 0x91, 0xfe))))
556 return start + 2;
557
558 PQXX_UNLIKELY
559 throw_for_encoding_error("JOHAB", buffer, start, 2);
560 }
561};
562
563
564/*
565PostgreSQL's MULE_INTERNAL is the emacs rather than Xemacs implementation;
566see the server/mb/pg_wchar.h PostgreSQL header file.
567This is implemented according to the description in said header file, but I was
568unable to get it to successfully iterate a MULE-encoded test CSV generated
569using PostgreSQL 9.2.23. Use this at your own risk.
570*/
571template<> struct glyph_scanner<encoding_group::MULE_INTERNAL>
572{
573 static PQXX_PURE std::size_t
574 call(char const buffer[], std::size_t buffer_len, std::size_t start)
575 {
576 if (start >= buffer_len)
577 PQXX_UNLIKELY return std::string::npos;
578
579 auto const byte1{get_byte(buffer, start)};
580 if (byte1 < 0x80)
581 return start + 1;
582
583 if (start + 2 > buffer_len)
584 PQXX_UNLIKELY
585 throw_for_encoding_error("MULE_INTERNAL", buffer, start, 1);
586
587 auto const byte2{get_byte(buffer, start + 1)};
588 if (between_inc(byte1, 0x81, 0x8d) and byte2 >= 0xa0)
589 return start + 2;
590
591 if (start + 3 > buffer_len)
592 PQXX_UNLIKELY
593 throw_for_encoding_error("MULE_INTERNAL", buffer, start, 2);
594
595 if (
596 ((byte1 == 0x9a and between_inc(byte2, 0xa0, 0xdf)) or
597 (byte1 == 0x9b and between_inc(byte2, 0xe0, 0xef)) or
598 (between_inc(byte1, 0x90, 0x99) and byte2 >= 0xa0)) and
599 (byte2 >= 0xa0))
600 return start + 3;
601
602 if (start + 4 > buffer_len)
603 PQXX_UNLIKELY
604 throw_for_encoding_error("MULE_INTERNAL", buffer, start, 3);
605
606 if (
607 ((byte1 == 0x9c and between_inc(byte2, 0xf0, 0xf4)) or
608 (byte1 == 0x9d and between_inc(byte2, 0xf5, 0xfe))) and
609 get_byte(buffer, start + 2) >= 0xa0 and
610 get_byte(buffer, start + 4) >= 0xa0)
611 return start + 4;
612
613 PQXX_UNLIKELY
614 throw_for_encoding_error("MULE_INTERNAL", buffer, start, 4);
615 }
616};
617
618
619// As far as I can tell, for the purposes of iterating the only difference
620// between SJIS and SJIS-2004 is increased range in the first byte of two-byte
621// sequences (0xEF increased to 0xFC). Officially, that is; apparently the
622// version of SJIS used by Postgres has the same range as SJIS-2004. They both
623// have increased range over the documented versions, not having the even/odd
624// restriction for the first byte in 2-byte sequences.
625//
626// https://en.wikipedia.org/wiki/Shift_JIS#Shift_JIS_byte_map
627// http://x0213.org/codetable/index.en.html
628template<> struct glyph_scanner<encoding_group::SJIS>
629{
630 static PQXX_PURE std::size_t
631 call(char const buffer[], std::size_t buffer_len, std::size_t start)
632 {
633 if (start >= buffer_len)
634 return std::string::npos;
635
636 auto const byte1{get_byte(buffer, start)};
637 if (byte1 < 0x80 or between_inc(byte1, 0xa1, 0xdf))
638 return start + 1;
639
640 if (
641 not between_inc(byte1, 0x81, 0x9f) and
642 not between_inc(byte1, 0xe0, 0xfc))
643 PQXX_UNLIKELY
644 throw_for_encoding_error("SJIS", buffer, start, 1);
645
646 if (start + 2 > buffer_len)
647 PQXX_UNLIKELY
648 throw_for_encoding_error("SJIS", buffer, start, buffer_len - start);
649
650 auto const byte2{get_byte(buffer, start + 1)};
651 if (byte2 == 0x7f)
652 PQXX_UNLIKELY
653 throw_for_encoding_error("SJIS", buffer, start, 2);
654
655 if (between_inc(byte2, 0x40, 0x9e) or between_inc(byte2, 0x9f, 0xfc))
656 return start + 2;
657
658 PQXX_UNLIKELY
659 throw_for_encoding_error("SJIS", buffer, start, 2);
660 }
661};
662
663
664// https://en.wikipedia.org/wiki/Unified_Hangul_Code
665template<> struct glyph_scanner<encoding_group::UHC>
666{
667 static PQXX_PURE std::size_t
668 call(char const buffer[], std::size_t buffer_len, std::size_t start)
669 {
670 if (start >= buffer_len)
671 PQXX_UNLIKELY return std::string::npos;
672
673 auto const byte1{get_byte(buffer, start)};
674 if (byte1 < 0x80)
675 return start + 1;
676
677 if (start + 2 > buffer_len)
678 PQXX_UNLIKELY
679 throw_for_encoding_error("UHC", buffer, start, buffer_len - start);
680
681 auto const byte2{get_byte(buffer, start + 1)};
682 if (between_inc(byte1, 0x80, 0xc6))
683 {
684 if (
685 between_inc(byte2, 0x41, 0x5a) or between_inc(byte2, 0x61, 0x7a) or
686 between_inc(byte2, 0x80, 0xfe))
687 return start + 2;
688
689 PQXX_UNLIKELY
690 throw_for_encoding_error("UHC", buffer, start, 2);
691 }
692
693 if (between_inc(byte1, 0xa1, 0xfe))
694 {
695 if (not between_inc(byte2, 0xa1, 0xfe))
696 PQXX_UNLIKELY
697 throw_for_encoding_error("UHC", buffer, start, 2);
698
699 return start + 2;
700 }
701
702 throw_for_encoding_error("UHC", buffer, start, 1);
703 }
704};
705
706
707// https://en.wikipedia.org/wiki/UTF-8#Description
708template<> struct glyph_scanner<encoding_group::UTF8>
709{
710 static PQXX_PURE std::size_t
711 call(char const buffer[], std::size_t buffer_len, std::size_t start)
712 {
713 if (start >= buffer_len)
714 PQXX_UNLIKELY return std::string::npos;
715
716 auto const byte1{get_byte(buffer, start)};
717 if (byte1 < 0x80)
718 return start + 1;
719
720 if (start + 2 > buffer_len)
721 PQXX_UNLIKELY
722 throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
723
724 auto const byte2{get_byte(buffer, start + 1)};
725 if (between_inc(byte1, 0xc0, 0xdf))
726 {
727 if (not between_inc(byte2, 0x80, 0xbf))
728 PQXX_UNLIKELY
729 throw_for_encoding_error("UTF8", buffer, start, 2);
730
731 return start + 2;
732 }
733
734 if (start + 3 > buffer_len)
735 PQXX_UNLIKELY
736 throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
737
738 auto const byte3{get_byte(buffer, start + 2)};
739 if (between_inc(byte1, 0xe0, 0xef))
740 {
741 if (between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf))
742 return start + 3;
743
744 PQXX_UNLIKELY
745 throw_for_encoding_error("UTF8", buffer, start, 3);
746 }
747
748 if (start + 4 > buffer_len)
749 PQXX_UNLIKELY
750 throw_for_encoding_error("UTF8", buffer, start, buffer_len - start);
751
752 if (between_inc(byte1, 0xf0, 0xf7))
753 {
754 if (
755 between_inc(byte2, 0x80, 0xbf) and between_inc(byte3, 0x80, 0xbf) and
756 between_inc(get_byte(buffer, start + 3), 0x80, 0xbf))
757 return start + 4;
758
759 PQXX_UNLIKELY
760 throw_for_encoding_error("UTF8", buffer, start, 4);
761 }
762
763 PQXX_UNLIKELY
764 throw_for_encoding_error("UTF8", buffer, start, 1);
765 }
766};
767
768
770
784constexpr inline encoding_group
785map_ascii_search_group(encoding_group enc) noexcept
786{
787 switch (enc)
788 {
789 case encoding_group::MONOBYTE:
790 case encoding_group::EUC_CN:
791 case encoding_group::EUC_JP:
792 case encoding_group::EUC_KR:
793 case encoding_group::EUC_TW:
794 case encoding_group::MULE_INTERNAL:
795 case encoding_group::UTF8:
796 // All these encodings are "ASCII-safe," meaning that if we're looking
797 // for a particular ASCII character, we can safely just go through the
798 // string byte for byte. Multibyte characters have the high bit set.
799 return encoding_group::MONOBYTE;
800
801 default: PQXX_UNLIKELY return enc;
802 }
803}
804
805
807
813template<char... NEEDLE>
814PQXX_PURE constexpr inline char_finder_func *
815get_char_finder(encoding_group enc)
816{
817 auto const as_if{map_ascii_search_group(enc)};
818 switch (as_if)
819 {
820 case encoding_group::MONOBYTE:
821 return pqxx::internal::find_ascii_char<
822 encoding_group::MONOBYTE, NEEDLE...>;
823 case encoding_group::BIG5:
824 return pqxx::internal::find_ascii_char<encoding_group::BIG5, NEEDLE...>;
825 case encoding_group::GB18030:
826 return pqxx::internal::find_ascii_char<encoding_group::GB18030, NEEDLE...>;
827 case encoding_group::GBK:
828 return pqxx::internal::find_ascii_char<encoding_group::GBK, NEEDLE...>;
829 case encoding_group::JOHAB:
830 return pqxx::internal::find_ascii_char<encoding_group::JOHAB, NEEDLE...>;
831 case encoding_group::SJIS:
832 return pqxx::internal::find_ascii_char<encoding_group::SJIS, NEEDLE...>;
833 case encoding_group::UHC:
834 return pqxx::internal::find_ascii_char<encoding_group::UHC, NEEDLE...>;
835
836 default:
838 "Unexpected encoding group: ", as_if, " (mapped from ", enc, ").")};
839 }
840}
841
842
844
847template<char... NEEDLE>
848PQXX_PURE constexpr inline char_finder_func *
849get_s_char_finder(encoding_group enc)
850{
851 auto const as_if{map_ascii_search_group(enc)};
852 switch (as_if)
853 {
854 case encoding_group::MONOBYTE:
856 encoding_group::MONOBYTE, NEEDLE...>;
857 case encoding_group::BIG5:
858 return pqxx::internal::find_s_ascii_char<encoding_group::BIG5, NEEDLE...>;
859 case encoding_group::GB18030:
861 encoding_group::GB18030, NEEDLE...>;
862 case encoding_group::GBK:
863 return pqxx::internal::find_s_ascii_char<encoding_group::GBK, NEEDLE...>;
864 case encoding_group::JOHAB:
865 return pqxx::internal::find_s_ascii_char<encoding_group::JOHAB, NEEDLE...>;
866 case encoding_group::SJIS:
867 return pqxx::internal::find_s_ascii_char<encoding_group::SJIS, NEEDLE...>;
868 case encoding_group::UHC:
869 return pqxx::internal::find_s_ascii_char<encoding_group::UHC, NEEDLE...>;
870
871 default:
873 "Unexpected encoding group: ", as_if, " (mapped from ", enc, ").")};
874 }
875}
876} // namespace pqxx::internal
877#endif
Invalid argument passed to libpqxx, similar to std::invalid_argument.
Definition: except.hxx:266
Internal error in libpqxx library.
Definition: except.hxx:242
Internal items for libpqxx' own use. Do not use these yourself.
Definition: encodings.cxx:33
std::string concat(TYPE... item)
Efficiently combine a bunch of items into one big string.
Definition: concat.hxx:31
PQXX_PURE constexpr char_finder_func * get_char_finder(encoding_group enc)
Look up a character search function for an encoding group.
Definition: encodings.hxx:815
PQXX_PURE std::size_t find_s_ascii_char(std::string_view haystack, std::size_t here)
Find first of NEEDLE ASCII chars in haystack.
Definition: encodings.hxx:211
PQXX_PURE char const * name_encoding(int encoding_id)
Return PostgreSQL's name for encoding enum value.
std::size_t(char const buffer[], std::size_t buffer_len, std::size_t start) glyph_scanner_func
Function type: "find the end of the current glyph.".
Definition: encoding_group.hxx:53
PQXX_PURE constexpr char_finder_func * get_s_char_finder(encoding_group enc)
Look up a "sentry" character search function for an encoding group.
Definition: encodings.hxx:849
pqxx::internal::encoding_group enc_group(std::string_view encoding_name)
Convert libpq encoding name to its libpqxx encoding group.
Definition: encodings.cxx:35
void for_glyphs(encoding_group enc, CALLABLE callback, char const buffer[], std::size_t buffer_len, std::size_t start=0)
Iterate over the glyphs in a buffer.
Definition: encodings.hxx:87
PQXX_LIBEXPORT glyph_scanner_func * get_glyph_scanner(encoding_group)
Look up the glyph scanner function for a given encoding group.
std::size_t(std::string_view haystack, std::size_t start) char_finder_func
Function type: "find first occurrence of any of a specific set of ASCII characters.".
Definition: encoding_group.hxx:71
std::size_t find_char(glyph_scanner_func *scanner, std::string_view haystack, std::size_t here=0u)
Find any of the ASCII characters NEEDLE in haystack.
Definition: encodings.hxx:52
constexpr encoding_group map_ascii_search_group(encoding_group enc) noexcept
Just for searching an ASCII character, what encoding can we use here?
Definition: encodings.hxx:785
The home of all libpqxx classes, functions, templates, etc.
Definition: array.cxx:27
Wrapper struct template for "find next glyph" functions.
Definition: encodings.hxx:143
static PQXX_PURE std::size_t call(char const buffer[], std::size_t buffer_len, std::size_t start)
Find the next glyph in buffer after position start.