Bike-X  0.8
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros
OVR_UTF8Util.cpp
Go to the documentation of this file.
1 /**************************************************************************
2 
3 Filename : OVR_UTF8Util.cpp
4 Content : UTF8 Unicode character encoding/decoding support
5 Created : September 19, 2012
6 Notes :
7 Notes : Much useful info at "UTF-8 and Unicode FAQ"
8  http://www.cl.cam.ac.uk/~mgk25/unicode.html
9 
10 Copyright : Copyright 2014 Oculus VR, Inc. All Rights reserved.
11 
12 Licensed under the Oculus VR Rift SDK License Version 3.1 (the "License");
13 you may not use the Oculus VR Rift SDK except in compliance with the License,
14 which is provided at the time of installation or download, or which
15 otherwise accompanies this software in either electronic or hard copy form.
16 
17 You may obtain a copy of the License at
18 
19 http://www.oculusvr.com/licenses/LICENSE-3.1
20 
21 Unless required by applicable law or agreed to in writing, the Oculus VR SDK
22 distributed under the License is distributed on an "AS IS" BASIS,
23 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
24 See the License for the specific language governing permissions and
25 limitations under the License.
26 
27 ************************************************************************************/
28 
29 #include "OVR_UTF8Util.h"
30 
31 namespace OVR { namespace UTF8Util {
32 
33 SPInt OVR_STDCALL GetLength(const char* buf, SPInt buflen)
34 {
35  const char* p = buf;
36  SPInt length = 0;
37 
38  if (buflen != -1)
39  {
40  while (p - buf < buflen)
41  {
42  // We should be able to have ASStrings with 0 in the middle.
44  length++;
45  }
46  }
47  else
48  {
50  length++;
51  }
52 
53  return length;
54 }
55 
56 UInt32 OVR_STDCALL GetCharAt(SPInt index, const char* putf8str, SPInt length)
57 {
58  const char* buf = putf8str;
59  UInt32 c = 0;
60 
61  if (length != -1)
62  {
63  while (buf - putf8str < length)
64  {
66  if (index == 0)
67  return c;
68  index--;
69  }
70 
71  return c;
72  }
73 
74  do
75  {
77  index--;
78 
79  if (c == 0)
80  {
81  // We've hit the end of the string; don't go further.
82  OVR_ASSERT(index == 0);
83  return c;
84  }
85  } while (index >= 0);
86 
87  return c;
88 }
89 
90 SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
91 {
92  const char* buf = putf8str;
93 
94  if (length != -1)
95  {
96  while ((buf - putf8str) < length && index > 0)
97  {
99  index--;
100  }
101 
102  return buf-putf8str;
103  }
104 
105  while (index > 0)
106  {
108  index--;
109 
110  if (c == 0)
111  return buf-putf8str;
112  };
113 
114  return buf-putf8str;
115 }
116 
118 {
119  if (ucs_character <= 0x7F)
120  return 1;
121  else if (ucs_character <= 0x7FF)
122  return 2;
123  else if (ucs_character <= 0xFFFF)
124  return 3;
125  else if (ucs_character <= 0x1FFFFF)
126  return 4;
127  else if (ucs_character <= 0x3FFFFFF)
128  return 5;
129  else if (ucs_character <= 0x7FFFFFFF)
130  return 6;
131  else
132  return 0;
133 }
134 
135 UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char** putf8Buffer)
136 {
137  UInt32 uc;
138  char c;
139 
140  // Security considerations:
141  //
142  // Changed, this is now only the case for DecodeNextChar:
143  // - If we hit a zero byte, we want to return 0 without stepping
144  // the buffer pointer past the 0. th
145  //
146  // If we hit an "overlong sequence"; i.e. a character encoded
147  // in a longer multibyte string than is necessary, then we
148  // need to discard the character. This is so attackers can't
149  // disguise dangerous characters or character sequences --
150  // there is only one valid encoding for each character.
151  //
152  // If we decode characters { 0xD800 .. 0xDFFF } or { 0xFFFE,
153  // 0xFFFF } then we ignore them; they are not valid in UTF-8.
154 
155  // This isn't actually an invalid character; it's a valid char that
156  // looks like an inverted question mark.
157 #define INVALID_CHAR 0x0FFFD
158 
159 #define FIRST_BYTE(mask, shift) \
160  uc = (c & (mask)) << (shift);
161 
162 #define NEXT_BYTE(shift) \
163  c = **putf8Buffer; \
164  if (c == 0) return 0; /* end of buffer, do not advance */ \
165  if ((c & 0xC0) != 0x80) return INVALID_CHAR; /* standard check */ \
166  (*putf8Buffer)++; \
167  uc |= (c & 0x3F) << shift;
168 
169  c = **putf8Buffer;
170  (*putf8Buffer)++;
171  if (c == 0)
172  return 0; // End of buffer.
173 
174  if ((c & 0x80) == 0) return (UInt32) c; // Conventional 7-bit ASCII.
175 
176  // Multi-byte sequences.
177  if ((c & 0xE0) == 0xC0)
178  {
179  // Two-byte sequence.
180  FIRST_BYTE(0x1F, 6);
181  NEXT_BYTE(0);
182  if (uc < 0x80) return INVALID_CHAR; // overlong
183  return uc;
184  }
185  else if ((c & 0xF0) == 0xE0)
186  {
187  // Three-byte sequence.
188  FIRST_BYTE(0x0F, 12);
189  NEXT_BYTE(6);
190  NEXT_BYTE(0);
191  if (uc < 0x800) return INVALID_CHAR; // overlong
192  // Not valid ISO 10646, but Flash requires these to work
193  // see AS3 test e15_5_3_2_3 for String.fromCharCode().charCodeAt(0)
194  // if (uc >= 0x0D800 && uc <= 0x0DFFF) return INVALID_CHAR;
195  // if (uc == 0x0FFFE || uc == 0x0FFFF) return INVALID_CHAR; // not valid ISO 10646
196  return uc;
197  }
198  else if ((c & 0xF8) == 0xF0)
199  {
200  // Four-byte sequence.
201  FIRST_BYTE(0x07, 18);
202  NEXT_BYTE(12);
203  NEXT_BYTE(6);
204  NEXT_BYTE(0);
205  if (uc < 0x010000) return INVALID_CHAR; // overlong
206  return uc;
207  }
208  else if ((c & 0xFC) == 0xF8)
209  {
210  // Five-byte sequence.
211  FIRST_BYTE(0x03, 24);
212  NEXT_BYTE(18);
213  NEXT_BYTE(12);
214  NEXT_BYTE(6);
215  NEXT_BYTE(0);
216  if (uc < 0x0200000) return INVALID_CHAR; // overlong
217  return uc;
218  }
219  else if ((c & 0xFE) == 0xFC)
220  {
221  // Six-byte sequence.
222  FIRST_BYTE(0x01, 30);
223  NEXT_BYTE(24);
224  NEXT_BYTE(18);
225  NEXT_BYTE(12);
226  NEXT_BYTE(6);
227  NEXT_BYTE(0);
228  if (uc < 0x04000000) return INVALID_CHAR; // overlong
229  return uc;
230  }
231  else
232  {
233  // Invalid.
234  return INVALID_CHAR;
235  }
236 }
237 
238 
239 void OVR_STDCALL EncodeChar(char* pbuffer, SPInt* pindex, UInt32 ucs_character)
240 {
241  if (ucs_character <= 0x7F)
242  {
243  // Plain single-byte ASCII.
244  pbuffer[(*pindex)++] = (char) ucs_character;
245  }
246  else if (ucs_character <= 0x7FF)
247  {
248  // Two bytes.
249  pbuffer[(*pindex)++] = 0xC0 | (char)(ucs_character >> 6);
250  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
251  }
252  else if (ucs_character <= 0xFFFF)
253  {
254  // Three bytes.
255  pbuffer[(*pindex)++] = 0xE0 | (char)(ucs_character >> 12);
256  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
257  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
258  }
259  else if (ucs_character <= 0x1FFFFF)
260  {
261  // Four bytes.
262  pbuffer[(*pindex)++] = 0xF0 | (char)(ucs_character >> 18);
263  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
264  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
265  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
266  }
267  else if (ucs_character <= 0x3FFFFFF)
268  {
269  // Five bytes.
270  pbuffer[(*pindex)++] = 0xF8 | (char)(ucs_character >> 24);
271  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
272  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
273  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
274  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
275  }
276  else if (ucs_character <= 0x7FFFFFFF)
277  {
278  // Six bytes.
279  pbuffer[(*pindex)++] = 0xFC | (char)(ucs_character >> 30);
280  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 24) & 0x3F);
281  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 18) & 0x3F);
282  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 12) & 0x3F);
283  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 6) & 0x3F);
284  pbuffer[(*pindex)++] = 0x80 | (char)((ucs_character >> 0) & 0x3F);
285  }
286  else
287  {
288  // Invalid char; don't encode anything.
289  }
290 }
291 
292 SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t* pchar, SPInt length)
293 {
294  SPInt len = 0;
295  if (length != -1)
296  for (int i = 0; i < length; i++)
297  {
298  len += GetEncodeCharSize(pchar[i]);
299  }
300  else
301  for (int i = 0;; i++)
302  {
303  if (pchar[i] == 0)
304  return len;
305  len += GetEncodeCharSize(pchar[i]);
306  }
307  return len;
308 }
309 
310 void OVR_STDCALL EncodeString(char *pbuff, const wchar_t* pchar, SPInt length)
311 {
312  SPInt ofs = 0;
313  if (length != -1)
314  {
315  for (int i = 0; i < length; i++)
316  {
317  EncodeChar(pbuff, &ofs, pchar[i]);
318  }
319  }
320  else
321  {
322  for (int i = 0;; i++)
323  {
324  if (pchar[i] == 0)
325  break;
326  EncodeChar(pbuff, &ofs, pchar[i]);
327  }
328  }
329  pbuff[ofs] = 0;
330 }
331 
332 UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char* putf8str, SPInt bytesLen)
333 {
334  wchar_t *pbegin = pbuff;
335  if (bytesLen == -1)
336  {
337  while (1)
338  {
339  UInt32 ch = DecodeNextChar_Advance0(&putf8str);
340  if (ch == 0)
341  break;
342  else if (ch >= 0xFFFF)
343  ch = 0xFFFD;
344  *pbuff++ = wchar_t(ch);
345  }
346  }
347  else
348  {
349  const char* p = putf8str;
350  while ((p - putf8str) < bytesLen)
351  {
353  if (ch >= 0xFFFF)
354  ch = 0xFFFD;
355  *pbuff++ = wchar_t(ch);
356  }
357  }
358 
359  *pbuff = 0;
360  return pbuff - pbegin;
361 }
362 
363 
364 #ifdef UTF8_UNIT_TEST
365 
366 // Compile this test case with something like:
367 //
368 // gcc utf8.cpp -g -I.. -DUTF8_UNIT_TEST -lstdc++ -o utf8_test
369 //
370 // or
371 //
372 // cl utf8.cpp -Zi -Od -DUTF8_UNIT_TEST -I..
373 //
374 // If possible, try running the test program with the first arg
375 // pointing at the file:
376 //
377 // http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
378 //
379 // and examine the results by eye to make sure they are acceptable to
380 // you.
381 
382 
383 #include "base/utility.h"
384 #include <stdio.h>
385 
386 
387 bool check_equal(const char* utf8_in, const UInt32* ucs_in)
388 {
389  for (;;)
390  {
391  UInt32 next_ucs = *ucs_in++;
392  UInt32 next_ucs_from_utf8 = utf8::decode_next_unicode_character(&utf8_in);
393  if (next_ucs != next_ucs_from_utf8)
394  {
395  return false;
396  }
397  if (next_ucs == 0)
398  {
399  OVR_ASSERT(next_ucs_from_utf8 == 0);
400  break;
401  }
402  }
403 
404  return true;
405 }
406 
407 
// Prints a zero-terminated byte string to stdout. Newlines and bytes in
// the range 32..127 are printed as they are; every other byte is shown
// as <0xNN>.
void log_ascii(const char* line)
{
    for (const char* cursor = line; *cursor != 0; ++cursor)
    {
        const unsigned char byte = (unsigned char) *cursor;
        const bool plain = (byte == '\n') || (byte >= 32 && byte <= 127);

        if (plain)
            printf("%c", byte);
        else
            printf("<0x%02X>", (int) byte);   // non-printable as plain ASCII
    }
}
430 
431 
432 void log_ucs(const UInt32* line)
433 {
434  for (;;)
435  {
436  UInt32 uc = *line++;
437  if (uc == 0)
438  {
439  // End of line.
440  return;
441  }
442  else if (uc != '\n'
443  && (uc < 32 || uc > 127))
444  {
445  // Non-printable as plain ASCII.
446  printf("<U-%04X>", uc);
447  }
448  else
449  {
450  printf("%c", (char) uc);
451  }
452  }
453 }
454 
455 
456 // Simple canned test.
457 int main(int argc, const char* argv[])
458 {
459  {
460  const char* test8 = "Ignacio CastaƱo";
461  const UInt32 test32[] =
462  {
463  0x49, 0x67, 0x6E, 0x61, 0x63,
464  0x69, 0x6F, 0x20, 0x43, 0x61,
465  0x73, 0x74, 0x61, 0xF1, 0x6F,
466  0x00
467  };
468 
469  OVR_ASSERT(check_equal(test8, test32));
470  }
471 
472  // If user passed an arg, try reading the file as UTF-8 encoded text.
473  if (argc > 1)
474  {
475  const char* filename = argv[1];
476  FILE* fp = fopen(filename, "rb");
477  if (fp == NULL)
478  {
479  printf("Can't open file '%s'\n", filename);
480  return 1;
481  }
482 
483  // Read lines from the file, encode/decode them, and highlight discrepancies.
484  const int LINE_SIZE = 200; // max line size
485  char line_buffer_utf8[LINE_SIZE];
486  char reencoded_utf8[6 * LINE_SIZE];
487  UInt32 line_buffer_ucs[LINE_SIZE];
488 
489  int byte_counter = 0;
490  for (;;)
491  {
492  int c = fgetc(fp);
493  if (c == EOF)
494  {
495  // Done.
496  break;
497  }
498  line_buffer_utf8[byte_counter++] = c;
499  if (c == '\n' || byte_counter >= LINE_SIZE - 2)
500  {
501  // End of line. Process the line.
502  line_buffer_utf8[byte_counter++] = 0; // terminate.
503 
504  // Decode into UCS.
505  const char* p = line_buffer_utf8;
506  UInt32* q = line_buffer_ucs;
507  for (;;)
508  {
510  *q++ = uc;
511 
512  OVR_ASSERT(q < line_buffer_ucs + LINE_SIZE);
513  OVR_ASSERT(p < line_buffer_utf8 + LINE_SIZE);
514 
515  if (uc == 0) break;
516  }
517 
518  // Encode back into UTF-8.
519  q = line_buffer_ucs;
520  int index = 0;
521  for (;;)
522  {
523  UInt32 uc = *q++;
524  OVR_ASSERT(index < LINE_SIZE * 6 - 6);
525  int last_index = index;
526  UTF8Util::EncodeChar(reencoded_utf8, &index, uc);
527  OVR_ASSERT(index <= last_index + 6);
528  if (uc == 0) break;
529  }
530 
531  // This can be useful for debugging.
532 #if 0
533  // Show the UCS and the re-encoded UTF-8.
534  log_ucs(line_buffer_ucs);
535  log_ascii(reencoded_utf8);
536 #endif // 0
537 
538  OVR_ASSERT(check_equal(line_buffer_utf8, line_buffer_ucs));
539  OVR_ASSERT(check_equal(reencoded_utf8, line_buffer_ucs));
540 
541  // Start next line.
542  byte_counter = 0;
543  }
544  }
545 
546  fclose(fp);
547  }
548 
549  return 0;
550 }
551 
552 
553 #endif // UTF8_UNIT_TEST
554 
555 }} // namespace UTF8Util::OVR
556 
#define NEXT_BYTE(shift)
UInt32 OVR_STDCALL GetCharAt(SPInt index, const char *putf8str, SPInt length)
int OVR_STDCALL GetEncodeCharSize(UInt32 ucs_character)
#define NULL
SPInt OVR_STDCALL GetLength(const char *buf, SPInt buflen)
UInt32 OVR_STDCALL DecodeNextChar_Advance0(const char **putf8Buffer)
UPInt OVR_STDCALL DecodeString(wchar_t *pbuff, const char *putf8str, SPInt bytesLen)
uint32_t UInt32
Definition: OVR_Types.h:253
void OVR_STDCALL EncodeChar(char *pbuffer, SPInt *pindex, UInt32 ucs_character)
size_t UPInt
Definition: OVR_Types.h:218
#define OVR_STDCALL
UInt32 DecodeNextChar(const char **putf8Buffer)
Definition: OVR_UTF8Util.h:88
__WCHAR_TYPE__ wchar_t
#define OVR_ASSERT(p)
int main()
Definition: tests.cpp:6
SPInt OVR_STDCALL GetByteIndex(SPInt index, const char *putf8str, SPInt length)
ptrdiff_t SPInt
Definition: OVR_Types.h:219
void OVR_STDCALL EncodeString(char *pbuff, const wchar_t *pchar, SPInt length)
SPInt OVR_STDCALL GetEncodeStringSize(const wchar_t *pchar, SPInt length)
int char * index(const char *__s, int __c) __THROW __attribute_pure__ __nonnull((1))
#define FIRST_BYTE(mask, shift)
#define INVALID_CHAR