forked from cyisfor/lua_utf8
-
Notifications
You must be signed in to change notification settings - Fork 0
/
utf8.lua
98 lines (90 loc) · 3.58 KB
/
utf8.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
-- Bit-operation library used by the rest of this file.
-- Keep an existing global `bit32` when there is one; otherwise alias the
-- global `bit` library (the 5.1 / LuaJIT-era bit library) under that name.
-- NOTE: this deliberately writes the *global* `bit32`; if neither library
-- exists the global simply stays nil.
if not bit32 then
  bit32 = bit
end
--- Encode one code point as a UTF-8 byte sequence of whatever length it needs.
-- Works for the whole original UTF-8 range (0 .. 0x7FFFFFFF), so values above
-- 0x1FFFFF come out as 5- and 6-byte sequences. The shortest possible form is
-- always produced, matching the hand-unrolled branches of the module's
-- `encode` function.
-- Implemented with plain arithmetic, so it does not need a bit library.
-- @param codepoint non-negative integer code point
-- @return string holding the encoded bytes
local function longEncode(codepoint)
  -- backwards compatibility: 7-bit ASCII is encoded as itself
  if codepoint < 0x80 then return string.char(codepoint) end

  local chars = ""
  -- Number of distinct values the lead byte can still carry. The lead byte
  -- loses one payload bit for every continuation byte it announces
  -- (110xxxxx, 1110xxxx, 11110xxx, ...), so this halves per continuation.
  local topspace = 0x40
  -- Even if what is left of the code point would fit inside 10xxxxxx, it may
  -- not fit into the shrunken lead byte, in which case one more continuation
  -- byte is emitted and the lead byte carries (almost) no payload.
  -- example: 0x90b
  --   after one continuation byte 0x24 = 100100 is left
  --   lead byte 110xxxxx only has room for five bits -> info would be lost
  -- thus we do:
  --   11100000 - 10100100 - 10001011
  repeat
    -- peel the low six bits off into a 10xxxxxx continuation byte
    chars = string.char(0x80 + codepoint % 0x40) .. chars
    codepoint = math.floor(codepoint / 0x40)
    topspace = topspace / 2
  until codepoint < topspace -- stop once the remainder fits the lead byte

  -- The lead marker is a run of 1 bits followed by a 0 bit:
  -- 0xC0 for one continuation byte, 0xE0 for two, 0xF0 for three, ...
  local marker = math.floor(0x100 - 2 * topspace)
  return string.char(marker + codepoint) .. chars
end
-- Module table: conversion between UTF-8 byte strings and code point lists.
return {
  --- Encode code points as one UTF-8 string.
  -- May be called with a table of code points, `encode({0x48, 0x69})`, or
  -- with the code points as separate arguments, `encode(0x48, 0x69)`;
  -- a single number, `encode(0x48)`, is accepted as well.
  -- Code points of 0x200000 and above are handed to the file-local
  -- `longEncode` helper.
  -- @param t table of code points, or the first code point
  -- @param derp second code point when called with separate arguments
  -- @param ... remaining code points when called with separate arguments
  -- @return string with the encoded bytes
  encode = function(t, derp, ...)
    if derp ~= nil or type(t) == "number" then
      t = {t, derp, ...}
    end
    local char, floor = string.char, math.floor
    -- collect the pieces and join once instead of growing a string per item
    local parts = {}
    for i, codepoint in ipairs(t) do
      local piece
      -- the common lengths are spelled out by hand; plain arithmetic is used
      -- so that no bit library is required
      if codepoint < 0x80 then
        piece = char(codepoint)
      elseif codepoint < 0x800 then
        piece = char(0xC0 + floor(codepoint / 0x40),
                     0x80 + codepoint % 0x40)
      elseif codepoint < 0x10000 then
        piece = char(0xE0 + floor(codepoint / 0x1000),
                     0x80 + floor(codepoint / 0x40) % 0x40,
                     0x80 + codepoint % 0x40)
      elseif codepoint < 0x200000 then
        piece = char(0xF0 + floor(codepoint / 0x40000),
                     0x80 + floor(codepoint / 0x1000) % 0x40,
                     0x80 + floor(codepoint / 0x40) % 0x40,
                     0x80 + codepoint % 0x40)
      else
        -- alpha centauri?! (5- and 6-byte sequences)
        piece = longEncode(codepoint)
      end
      parts[i] = piece
    end
    return table.concat(parts)
  end,

  --- Decode a UTF-8 string into a list of code points.
  -- Based on the decoder from http://lua-users.org/wiki/LuaUnicode
  -- Sequences of up to six bytes are understood. Overlong forms are still
  -- accepted, but structurally broken input (a continuation byte where a
  -- lead byte is expected, a missing or wrong continuation byte, a sequence
  -- cut off by the end of the string, or the bytes 0xFE / 0xFF) raises an
  -- error.
  -- @param s string to decode
  -- @return table (array) of code points, empty for the empty string
  decode = function(s)
    assert(type(s) == "string")
    local byte = string.byte
    local res, count = {}, 0
    local i, len = 1, #s
    while i <= len do
      local c = byte(s, i)
      -- seq: total length of the sequence; val: payload bits of the lead byte
      local seq, val
      if c < 0x80 then
        seq, val = 1, c
      elseif c < 0xC0 then
        -- 10xxxxxx can only continue a sequence, never start one
        error("invalid UTF-8 character sequence")
      elseif c < 0xE0 then
        seq, val = 2, c - 0xC0
      elseif c < 0xF0 then
        seq, val = 3, c - 0xE0
      elseif c < 0xF8 then
        seq, val = 4, c - 0xF0
      elseif c < 0xFC then
        seq, val = 5, c - 0xF8
      elseif c < 0xFE then
        seq, val = 6, c - 0xFC
      else
        error("invalid UTF-8 character sequence")
      end
      for j = i + 1, i + seq - 1 do
        local cont = byte(s, j)
        -- cont is nil when the string ends in the middle of a sequence
        if cont == nil or cont < 0x80 or cont >= 0xC0 then
          error("invalid UTF-8 character sequence")
        end
        -- shift in the six payload bits of the continuation byte
        val = val * 0x40 + (cont - 0x80)
      end
      count = count + 1
      res[count] = val
      i = i + seq
    end
    return res
  end,

  --- Encode a table of code points using the file-local `longEncode` helper
  -- for every element, whatever its size.
  -- @param t table (array) of code points
  -- @return string with the encoded bytes
  longEncode = function(t)
    local parts = {}
    for i, codepoint in ipairs(t) do
      parts[i] = longEncode(codepoint)
    end
    return table.concat(parts)
  end
}