UCommon
unicode.h
Go to the documentation of this file.
1 // Copyright (C) 2009-2014 David Sugar, Tycho Softworks.
2 // Copyright (C) 2015 Cherokees of Idaho.
3 //
4 // This file is part of GNU uCommon C++.
5 //
6 // GNU uCommon C++ is free software: you can redistribute it and/or modify
7 // it under the terms of the GNU Lesser General Public License as published
8 // by the Free Software Foundation, either version 3 of the License, or
9 // (at your option) any later version.
10 //
11 // GNU uCommon C++ is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU Lesser General Public License for more details.
15 //
16 // You should have received a copy of the GNU Lesser General Public License
17 // along with GNU uCommon C++. If not, see <http://www.gnu.org/licenses/>.
18 
33 #ifndef _UCOMMON_UNICODE_H_
34 #define _UCOMMON_UNICODE_H_
35 
36 #ifndef _UCOMMON_STRING_H_
37 #include <ucommon/string.h>
38 #endif
39 
40 #ifdef nil
41 #undef nil
42 #endif
43 
44 namespace ucommon {
45 
/** 32 bit unicode character code. */
50 typedef int32_t ucs4_t;
51 
/** 16 bit unicode character code. */
55 typedef int16_t ucs2_t;
56 
/**
 * Untyped pointer used wherever a unicode (wide) string is passed.
 * Resolves issues where wchar_t is not defined.
 */
60 typedef void *unicode_t;
61 
/**
 * A core class of utf8 encoded string functions.
 *
 * Every operation is a static member. Both constructors are protected, so
 * the class cannot be instantiated on its own; it can only be used as a
 * base class (UString derives from it) or through its static members.
 *
 * Only declarations appear here; the definitions live elsewhere, so members
 * without a description below have semantics that are not visible in this
 * header.
 */
67 class __EXPORT utf8
68 {
69 protected:
    /** Default constructor; does nothing. Protected to block direct instantiation. */
70  inline utf8() {};
71 
    /** Copy constructor; does nothing (the argument is ignored). */
72  inline utf8(const utf8& copy) {};
73 
74 public:
    // NOTE(review): defined elsewhere; presumably the size of one unicode
    // character unit -- confirm against the implementation.
78  static const unsigned ucsize;
79 
    /** A convenient NULL pointer value. */
83  static const char *nil;
84 
    // NOTE(review): presumably the byte length of the utf8 sequence that
    // starts at codepoint -- confirm against the implementation.
90  static unsigned size(const char *codepoint);
91 
    /**
     * Count utf8 encoded ucs4 codepoints in string.
     * @param string of utf8 text to examine.
     * @return number of codepoints counted.
     */
97  static size_t count(const char *string);
98 
105  static char *offset(char *string, ssize_t position);
106 
    /**
     * Convert a utf8 encoded codepoint to a ucs4 character value.
     * @param encoded utf8 text to convert.
     * @return ucs4 character value.
     */
112  static ucs4_t codepoint(const char *encoded);
113 
    // Two overloads of chars(): one takes a unicode string, the other a
    // single ucs4 character.
119  static size_t chars(const unicode_t string);
120 
126  static size_t chars(ucs4_t character);
127 
134  static size_t unpack(const unicode_t string, char *text, size_t size);
135 
143  static size_t pack(unicode_t unicode, const char *cp, size_t len);
144 
    /**
     * Dup a utf8 string into a ucs4_t string.
     * @param string of utf8 text to duplicate.
     * @return ucs4_t copy of the string.
     */
148  static ucs4_t *udup(const char *string);
149 
    /**
     * Dup a utf8 string into a ucs2_t representation.
     * @param string of utf8 text to duplicate.
     * @return ucs2_t copy of the string.
     */
153  static ucs2_t *wdup(const char *string);
154 
    // start defaults to 0.
162  static const char *find(const char *string, ucs4_t character, size_t start = 0);
163 
    // end defaults to (size_t)-1, i.e. the largest size_t value.
171  static const char *rfind(const char *string, ucs4_t character, size_t end = (size_t)-1l);
172 
179  static unsigned ccount(const char *string, ucs4_t character);
180 
186  static ucs4_t get(const char *cp);
187 
194  static void put(ucs4_t character, char *buf);
195 };
196 
/**
 * A copy-on-write utf8 string class that operates by reference count.
 *
 * Derives from both String and utf8. Positions and sizes in this interface
 * are expressed through the utf8/ucs4 helpers rather than raw bytes where
 * the inline code shows it (see count()).
 *
 * NOTE(review): no `public:` label is visible in this listing, so every
 * member below is protected as shown -- confirm against the real header.
 * Members without a description are only declared here; their semantics
 * are not visible in this header.
 */
203 class __EXPORT UString : public String, public utf8
204 {
205 protected:
209  UString();
210 
215  UString(size_t size);
216 
221  UString(const unicode_t text);
222 
229  UString(const char *text, size_t size);
230 
237  UString(const unicode_t *text, const unicode_t *end);
238 
244  UString(const UString& existing);
245 
250  virtual ~UString();
251 
    // size defaults to 0.
258  UString get(size_t codepoint, size_t size = 0) const;
259 
266  size_t get(unicode_t unicode, size_t size) const;
267 
272  void set(const unicode_t unicode);
273 
278  void add(const unicode_t unicode);
279 
    /**
     * Return unicode character found at a specific codepoint in the string.
     * @param position of codepoint in string.
     * @return ucs4 character value.
     */
285  ucs4_t at(int position) const;
286 
    /**
     * Extract a unicode byte sequence from utf8 object.
     * Forwards directly to get(unicode, size).
     */
293  inline size_t operator()(unicode_t unicode, size_t size) const {
294  return get(unicode, size);
295  }
296 
303  UString operator()(int codepoint, size_t size) const;
304 
    /**
     * Convenience method for left of string.
     * Equivalent to operator()(0, size).
     */
310  inline UString left(size_t size) const {
311  return operator()(0, size);
312  }
313 
    /**
     * Convenience method for right of string.
     * Passes the negated offset and a size of 0 to operator()(int, size_t).
     */
319  inline UString right(size_t offset) const {
        // offset is narrowed to int before negation.
320  return operator()(-((int)offset), 0);
321  }
322 
    /**
     * Convenience method for substring extraction.
     * Equivalent to operator()((int)offset, size).
     */
329  inline UString copy(size_t offset, size_t size) const {
330  return operator()((int)offset, size);
331  }
332 
    // size defaults to 0.
338  void cut(size_t offset, size_t size = 0);
339 
    // size defaults to 0.
346  void paste(size_t offset, const char *text, size_t size = 0);
347 
355  const char *operator()(int offset) const;
356 
    /**
     * Reference a unicode character in string object by array offset.
     * Forwards to UString::at(position).
     */
362  inline ucs4_t operator[](int position) const {
363  return UString::at(position);
364  }
365 
    /**
     * Count codepoints in current string.
     * Runs utf8::count() over str->text, so str is dereferenced
     * unconditionally.
     */
370  inline size_t count(void) const {
371  return (size_t)utf8::count(str->text);
372  }
373 
379  unsigned ccount(ucs4_t character) const;
380 
    // start defaults to 0.
387  const char *find(ucs4_t character, size_t start = 0) const;
388 
    // end defaults to npos.
395  const char *rfind(ucs4_t character, size_t end = npos) const;
396 };
397 
/**
 * Pointer to utf8 encoded character data.
 *
 * Wraps a single uint8_t pointer (text). The inline members below only read
 * that pointer; the constructors, arithmetic operators, inc() and dec() are
 * declared here and defined elsewhere, so their stepping rules are not
 * visible in this header.
 */
403 class __EXPORT utf8_pointer
404 {
405 protected:
    // Address of the utf8 data currently referenced; compared against NULL
    // by operator bool() and operator!().
406  uint8_t *text;
407 
408 public:
412  utf8_pointer();
413 
418  utf8_pointer(const char *string);
419 
425 
430  utf8_pointer& operator ++();
431 
436  utf8_pointer& operator --();
437 
443  utf8_pointer& operator +=(long offset);
444 
450  utf8_pointer& operator -=(long offset);
451 
457  utf8_pointer operator+(long offset) const;
458 
464  utf8_pointer operator-(long offset) const;
465 
    /**
     * Check if text is a valid pointer.
     * @return true if text is not NULL.
     */
470  inline operator bool() const {
471  return text != NULL;
472  }
473 
    /**
     * Check if text is an invalid pointer.
     * @return true if text is NULL.
     */
478  inline bool operator!() const {
479  return text == NULL;
480  }
481 
487  ucs4_t operator[](long codepoint) const;
488 
494  utf8_pointer& operator=(const char *string);
495 
499  void inc(void);
500 
504  void dec(void);
505 
    /**
     * Check if pointer equals another string.
     * Compares the two addresses, not the characters they point to.
     */
511  inline bool operator==(const char *string) const {
512  return (const char *)text == string;
513  }
514 
    /**
     * Check if pointer does not equal another string.
     * Compares the two addresses, not the characters they point to.
     */
520  inline bool operator!=(const char *string) const {
521  return (const char *)text != string;
522  }
523 
    /**
     * Decode the data at the current position via utf8::codepoint().
     * @return ucs4 character value.
     */
528  inline ucs4_t operator*() const {
529  return utf8::codepoint((const char *)text);
530  }
531 
    /**
     * Get c string we point to.
     * @return text reinterpreted as a (non-const) char pointer.
     */
536  inline char *c_str(void) const {
537  return (char *)text;
538  }
539 
    /**
     * Implicit conversion to a char pointer; returns the same address as
     * c_str().
     */
544  inline operator char*() const {
545  return (char *)text;
546  }
547 
    /**
     * Get length of null terminated utf8 string in codepoints.
     * Forwards text to utf8::count().
     */
552  inline size_t len(void) const {
553  return utf8::count((const char *)text);
554  }
555 };
556 
/**
 * Free-function shorthand for utf8::udup(): dup a utf8 string into a
 * ucs4_t string.
 */
557 inline ucs4_t *strudup(const char *string) {
558  return utf8::udup(string);
559 }
560 
/**
 * Free-function shorthand for utf8::wdup(): dup a utf8 string into a
 * ucs2_t representation.
 */
561 inline ucs2_t *strwdup(const char *string) {
562  return utf8::wdup(string);
563 }
564 
// Exported function, declared here and defined elsewhere.
// NOTE(review): presumably duplicates a utf8 string into a newly allocated
// unicode_t buffer, by analogy with strudup/strwdup -- confirm against the
// implementation.
565 __EXPORT unicode_t unidup(const char *string);
566 
// Explicit specialization of the dupfree template for ucs2_t strings:
// releases the buffer with ::free().
567 template<>
568 inline void dupfree<ucs2_t*>(ucs2_t *string) {
569  ::free(string);
570 }
571 
// Explicit specialization of the dupfree template for ucs4_t strings:
// releases the buffer with ::free().
572 template<>
573 inline void dupfree<ucs4_t*>(ucs4_t *string) {
574  ::free(string);
575 }
576 
// Explicit specialization of the dupfree template for unicode_t (void *)
// buffers: releases the buffer with ::free().
577 template<>
578 inline void dupfree<unicode_t>(unicode_t string) {
579  ::free(string);
580 }
581 
586 
591 
592 } // namespace ucommon
593 
594 #endif
ucs4_t at(int position) const
Return unicode character found at a specific codepoint in the string.
static ucs2_t * wdup(const char *string)
Dup a utf8 string into a ucs2_t representation.
A copy-on-write utf8 string class that operates by reference count.
Definition: unicode.h:203
char * c_str(void) const
Get c string we point to.
Definition: unicode.h:536
ucs4_t operator[](int position) const
Reference a unicode character in string object by array offset.
Definition: unicode.h:362
size_t operator()(unicode_t unicode, size_t size) const
Extract a unicode byte sequence from utf8 object.
Definition: unicode.h:293
A copy-on-write string class that operates by reference count.
Definition: string.h:78
size_t len(void) const
Get length of null terminated utf8 string in codepoints.
Definition: unicode.h:552
static size_t count(const char *string)
Count utf8 encoded ucs4 codepoints in string.
A common string class and character string support functions.
size_t count(void) const
Count codepoints in current string.
Definition: unicode.h:370
int16_t ucs2_t
16 bit unicode character code.
Definition: unicode.h:55
static ucs4_t codepoint(const char *encoded)
Convert a utf8 encoded codepoint to a ucs4 character value.
utf8_pointer utf8_t
Convenience type for utf8_pointer strings.
Definition: unicode.h:590
int32_t ucs4_t
32 bit unicode character code.
Definition: unicode.h:50
A core class of utf8 encoded string functions.
Definition: unicode.h:67
bool operator!() const
Check if text is an invalid pointer.
Definition: unicode.h:478
Common namespace for all ucommon objects.
Definition: access.h:47
static ucs4_t * udup(const char *string)
Dup a utf8 string into a ucs4_t string.
T copy(const T &src)
Convenience function to copy objects.
Definition: generics.h:395
bool operator==(const char *string) const
Check if pointer equals another string.
Definition: unicode.h:511
Pointer to utf8 encoded character data.
Definition: unicode.h:403
UString copy(size_t offset, size_t size) const
Convenience method for substring extraction.
Definition: unicode.h:329
static const char * nil
A convenient NULL pointer value.
Definition: unicode.h:83
UString ustring_t
Convenience type for utf8 encoded strings.
Definition: unicode.h:585
bool operator!=(const char *string) const
Check if pointer does not equal another string.
Definition: unicode.h:520
void * unicode_t
Resolves issues where wchar_t is not defined.
Definition: unicode.h:60
UString right(size_t offset) const
Convenience method for right of string.
Definition: unicode.h:319
UString left(size_t size) const
Convenience method for left of string.
Definition: unicode.h:310