-
Notifications
You must be signed in to change notification settings - Fork 1
/
utf8.c
132 lines (124 loc) · 4.02 KB
/
utf8.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/* The FreeDOS-32 Unicode Support Library version 2.1
* Copyright (C) 2001-2006 Salvatore ISAJA
*
* This file "utf8.c" is part of the FreeDOS-32 Unicode
* Support Library (the Program).
*
* The Program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* The Program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with the Program; see the file GPL.txt; if not, write to
* the Free Software Foundation, Inc.,
* 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
*/
#include <config.h>
#include "unicode.h"
/**
* \addtogroup unicode
* @{
*/
/* Bit mask and expected bit pattern of a UTF-8 character lead byte.
 * Entry k describes the lead byte of a sequence that is k + 1 bytes long:
 * a byte b matches when (b & mask) == val.
 * The fields are unsigned char because the values 0x80..0xF8 are not
 * representable in a plain char on targets where char is signed.
 * The table is only ever read, hence const.
 */
static const struct { unsigned char mask; unsigned char val; } t[4] =
{ { 0x80, 0x00 }, { 0xE0, 0xC0 }, { 0xF0, 0xE0 }, { 0xF8, 0xF0 } };
/**
 * \brief  Gets the length of a UTF-8 character.
 * \param  lead_byte the first byte of a UTF-8 character;
 * \retval >0        the length in bytes (1 to 4) of the UTF-8 character;
 * \retval -EILSEQ   invalid UTF-8 lead byte: a continuation byte (0x80..0xBF),
 *                   a byte that could only start an overlong two-byte form
 *                   (0xC0, 0xC1), or a byte that could only start a value
 *                   above U+10FFFF or a sequence longer than 4 bytes
 *                   (0xF5..0xFF);
 * \remarks For performance reasons, this function does not parse
 *          the whole UTF-8 byte sequence, just the first byte.
 *          If checking the validity of the whole UTF-8 byte sequence
 *          is needed, use unicode_utf8_to_wchar().
 */
int unicode_utf8_len(char lead_byte)
{
    /* Work on the raw byte value so the result does not depend on whether
     * plain char is signed on the target. */
    const unsigned char lead = (unsigned char) lead_byte;
    unsigned k;

    /* These bytes match a lead-byte bit pattern but never occur in
     * well-formed UTF-8. */
    if (lead == 0xC0 || lead == 0xC1 || lead > 0xF4)
        return -EILSEQ;
    for (k = 0; k < 4; k++)
        if ((lead & (unsigned char) t[k].mask) == (unsigned char) t[k].val)
            return (int) k + 1;
    return -EILSEQ;
}
/**
 * \brief  UTF-8 to wide character.
 * \param  result where to store the converted wide character;
 * \param  string buffer containing the UTF-8 character to convert;
 * \param  size   max number of bytes of \c string to examine;
 * \retval >0     the length in bytes of the processed UTF-8 character, the wide character is stored in \c result;
 * \retval -EILSEQ invalid UTF-8 byte sequence: bad lead byte, bad continuation
 *                 byte, overlong encoding, UTF-16 surrogate (U+D800..U+DFFF)
 *                 or value above U+10FFFF;
 * \retval -ENAMETOOLONG \c size too small to parse the UTF-8 character.
 * \remarks \c result is written only on success.
 */
int unicode_utf8_to_wchar(wchar_t *restrict result, const char *restrict string, size_t size)
{
    /* Smallest value that legitimately needs a sequence of 1, 2, 3, 4 bytes;
     * anything smaller decoded from that length is an overlong encoding. */
    static const unsigned long min_value[4] = { 0x0UL, 0x80UL, 0x800UL, 0x10000UL };
    unsigned long cp;
    unsigned char lead;
    unsigned len, j;

    if (!size) return -ENAMETOOLONG;

    /* Classify the lead byte and keep its payload bits. */
    lead = (unsigned char) *string;
    if (lead < 0x80)                { len = 1; cp = lead; }
    else if ((lead & 0xE0) == 0xC0) { len = 2; cp = lead & 0x1FU; }
    else if ((lead & 0xF0) == 0xE0) { len = 3; cp = lead & 0x0FU; }
    else if ((lead & 0xF8) == 0xF0) { len = 4; cp = lead & 0x07U; }
    else return -EILSEQ;

    /* Do not read past the bytes the caller allowed us to examine. */
    if (size < len) return -ENAMETOOLONG;

    /* Each continuation byte must look like 10xxxxxx and adds 6 bits. */
    for (j = 1; j < len; j++)
    {
        const unsigned char c = (unsigned char) string[j];
        if ((c & 0xC0) != 0x80) return -EILSEQ;
        cp = (cp << 6) | (c & 0x3FU);
    }

    /* Structurally valid sequences that are still ill-formed UTF-8. */
    if (cp < min_value[len - 1]) return -EILSEQ;          /* overlong form */
    if (cp >= 0xD800UL && cp <= 0xDFFFUL) return -EILSEQ; /* surrogate */
    if (cp > 0x10FFFFUL) return -EILSEQ;                  /* beyond Unicode */

    *result = (wchar_t) cp;
    return (int) len;
}
/**
 * \brief  Wide character to UTF-8.
 * \param  s    where to store the converted UTF-8 character;
 * \param  wc   the wide character to convert;
 * \param  size max number of bytes to store in \c s;
 * \retval >0   the length in bytes of the converted UTF-8 character, stored in \c s;
 * \retval -EINVAL invalid wide character: negative, a UTF-16 surrogate
 *                 (U+D800..U+DFFF) or above U+10FFFF, none of which has a
 *                 well-formed UTF-8 encoding;
 * \retval -ENAMETOOLONG \c size too small to store the UTF-8 character.
 * \remarks Nothing is written to \c s unless the whole character fits.
 */
int unicode_wchar_to_utf8(char *s, wchar_t wc, size_t size)
{
    unsigned long cp;
    unsigned long lead_mark; /* high marker bits of the lead byte */
    unsigned len, i;

    if (wc < 0) return -EINVAL;
    cp = (unsigned long) wc;

    /* Pick the shortest form able to hold the value. */
    if (cp < 0x80UL)
    {
        len = 1; lead_mark = 0x00UL;
    }
    else if (cp < 0x800UL)
    {
        len = 2; lead_mark = 0xC0UL;
    }
    else if (cp < 0x10000UL)
    {
        /* Surrogates are reserved for UTF-16 and are not encodable. */
        if (cp >= 0xD800UL && cp <= 0xDFFFUL) return -EINVAL;
        len = 3; lead_mark = 0xE0UL;
    }
    else if (cp <= 0x10FFFFUL)
    {
        len = 4; lead_mark = 0xF0UL;
    }
    else return -EINVAL;

    if (size < len) return -ENAMETOOLONG;

    /* Fill continuation bytes from the last one backwards, 6 bits each;
     * whatever is left of cp goes into the lead byte. */
    for (i = len - 1; i > 0; i--)
    {
        s[i] = (char) (0x80UL | (cp & 0x3FUL));
        cp >>= 6;
    }
    s[0] = (char) (lead_mark | cp);
    return (int) len;
}
/* @} */