# zeek/auxil/spicy/3rdparty/utf8proc/data/data_generator.jl
using OffsetArrays: Origin
parsehex(str) = parse(UInt32, str, base=16)
function parse_hex_range(line)
    m = match(r"^([0-9A-F]+)(\.\.([0-9A-F]+))? +; +([^#]+)", line)
    if isnothing(m)
        return nothing
    end
    i = parsehex(m[1])
    j = !isnothing(m[3]) ? parsehex(m[3]) : i
    desc = rstrip(m[4])
    return (i:j, desc)
end
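# For example, a DerivedCoreProperties.txt line such as
#   0041..005A    ; Uppercase # L&  [26] LATIN CAPITAL LETTER A..LATIN CAPITAL LETTER Z
# parses to (0x0041:0x005A, "Uppercase"); a line with a single codepoint yields
# a one-element range.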
function read_hex_ranges(filename)
    [r for r in parse_hex_range.(readlines(filename)) if !isnothing(r)]
end
function collect_codepoints(range_desc, description)
    list = UInt32[]
    for (r,d) in range_desc
        if d == description
            append!(list, r)
        end
    end
    list
end
function set_all!(d, keys, value)
    for k in keys
        d[k] = value
    end
end
#-------------------------------------------------------------------------------
derived_core_properties = read_hex_ranges("DerivedCoreProperties.txt")
ignorable = Set(collect_codepoints(derived_core_properties, "Default_Ignorable_Code_Point"))
uppercase = Set(collect_codepoints(derived_core_properties, "Uppercase"))
lowercase = Set(collect_codepoints(derived_core_properties, "Lowercase"))
#-------------------------------------------------------------------------------
function derive_indic_conjunct_break(derived_core_properties)
    props = Dict{UInt32, String}()
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Linker"), "LINKER")
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Consonant"), "CONSONANT")
    set_all!(props, collect_codepoints(derived_core_properties, "InCB; Extend"), "EXTEND")
    props
end
let indic_conjunct_break = derive_indic_conjunct_break(derived_core_properties)
    global function get_indic_conjunct_break(code)
        get(indic_conjunct_break, code, "NONE")
    end
end
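# Lookup sketch, assuming the usual DerivedCoreProperties.txt contents (the
# Devanagari virama U+094D is listed as "InCB; Linker"):
#   get_indic_conjunct_break(0x094D)  # == "LINKER"
#   get_indic_conjunct_break(0x0041)  # == "NONE" (no InCB property)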
#-------------------------------------------------------------------------------
function read_grapheme_boundclasses(grapheme_break_filename, emoji_data_filename)
    grapheme_boundclass = Dict{UInt32, String}()
    for (r,desc) in read_hex_ranges(grapheme_break_filename)
        set_all!(grapheme_boundclass, r, Base.uppercase(desc))
    end
    for (r,desc) in read_hex_ranges(emoji_data_filename)
        if desc == "Extended_Pictographic"
            set_all!(grapheme_boundclass, r, "EXTENDED_PICTOGRAPHIC")
        elseif desc == "Emoji_Modifier"
            set_all!(grapheme_boundclass, r, "EXTEND")
        end
    end
    return grapheme_boundclass
end
let grapheme_boundclasses = read_grapheme_boundclasses("GraphemeBreakProperty.txt", "emoji-data.txt")
    global function get_grapheme_boundclass(code)
        get(grapheme_boundclasses, code, "OTHER")
    end
end
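# Lookup sketch, assuming the usual UCD contents (U+200D ZERO WIDTH JOINER has
# Grapheme_Cluster_Break=ZWJ; unlisted codepoints fall back to "OTHER"):
#   get_grapheme_boundclass(0x200D)  # == "ZWJ"
#   get_grapheme_boundclass(0x0041)  # == "OTHER"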
#-------------------------------------------------------------------------------
function read_composition_exclusions(pattern)
    section = match(pattern, read("CompositionExclusions.txt",String)).match
    es = UInt32[]
    for line in split(section, '\n')
        m = match(r"^([0-9A-F]+) +#"i, line)
        if !isnothing(m)
            push!(es, parsehex(m[1]))
        end
    end
    es
end
exclusions = Set(read_composition_exclusions(r"# \(1\) Script Specifics.*?# Total code points:"s))
excl_version = Set(read_composition_exclusions(r"# \(2\) Post Composition Version precomposed characters.*?# Total code points:"s))
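# `exclusions` holds the script-specific exclusions (e.g. U+0958 DEVANAGARI
# LETTER QA) and `excl_version` the post-composition-version characters; both
# sets are kept out of the recomposition tables built below and flagged as
# comp_exclusion in the property table.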
#-------------------------------------------------------------------------------
function read_case_folding(filename)
    case_folding = Dict{UInt32,Vector{UInt32}}()
    for line in readlines(filename)
        m = match(r"^([0-9A-F]+); [CF]; ([0-9A-F ]+);"i, line)
        !isnothing(m) || continue
        case_folding[parsehex(m[1])] = parsehex.(split(m[2]))
    end
    case_folding
end
let case_folding = read_case_folding("CaseFolding.txt")
    global function get_case_folding(code)
        get(case_folding, code, nothing)
    end
end
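# Only common (C) and full (F) foldings are read. For example, CaseFolding.txt
# contains "00DF; F; 0073 0073; # LATIN SMALL LETTER SHARP S", so:
#   get_case_folding(0x00DF)  # == [0x0073, 0x0073]  (ß folds to "ss")
#   get_case_folding(0x0041)  # == [0x0061]          (A folds to a, a C folding)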
#-------------------------------------------------------------------------------
# Utilities for reading per-char properties from UnicodeData.txt
function split_unicode_data_line(line)
    m = match(r"""
              ([0-9A-F]+);        # code
              ([^;]+);            # name
              ([A-Z]+);           # general category
              ([0-9]+);           # canonical combining class
              ([A-Z]+);           # bidi class
              (<([A-Z]*)>)?       # decomposition type
              ((\ ?[0-9A-F]+)*);  # decomposition mapping
              ([0-9]*);           # decimal digit
              ([0-9]*);           # digit
              ([^;]*);            # numeric
              ([YN]*);            # bidi mirrored
              ([^;]*);            # unicode 1.0 name
              ([^;]*);            # iso comment
              ([0-9A-F]*);        # simple uppercase mapping
              ([0-9A-F]*);        # simple lowercase mapping
              ([0-9A-F]*)$        # simple titlecase mapping
              """ix, line)
    @assert !isnothing(m)
    code = parse(UInt32, m[1], base=16)
    (code = code,
     name = m[2],
     category = m[3],
     combining_class = parse(Int, m[4]),
     bidi_class = m[5],
     decomp_type = m[7],
     decomp_mapping = m[8] == "" ? nothing : parsehex.(split(m[8])),
     bidi_mirrored = m[13] == "Y",
     # issue #130: use nonstandard uppercase ß -> ẞ
     # issue #195: if character is uppercase but has no lowercase mapping,
     # then make lowercase mapping = itself (vice versa for lowercase)
     uppercase_mapping = m[16] != "" ? parsehex(m[16]) :
                         code == 0x000000df ? 0x00001e9e :
                         m[17] == "" && code in lowercase ? code :
                         nothing,
     lowercase_mapping = m[17] != "" ? parsehex(m[17]) :
                         m[16] == "" && code in uppercase ? code :
                         nothing,
     titlecase_mapping = m[18] != "" ? parsehex(m[18]) :
                         code == 0x000000df ? 0x00001e9e :
                         nothing,
    )
end
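# For instance, the UnicodeData.txt line
#   0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
# yields category "Lu", combining_class 0 and lowercase_mapping 0x0061, while
# uppercase_mapping stays `nothing`: the issue #195 fallback only applies to
# cased characters that lack both simple mappings.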
function read_unicode_data(filename)
    raw_char_props = split_unicode_data_line.(readlines(filename))
    char_props = Origin(0)(Vector{eltype(raw_char_props)}())
    @assert issorted(raw_char_props, by=c->c.code)
    raw_char_props = Iterators.Stateful(raw_char_props)
    while !isempty(raw_char_props)
        c = popfirst!(raw_char_props)
        if occursin(", First>", c.name)
            nc = popfirst!(raw_char_props)
            @assert occursin(", Last>", nc.name)
            name = replace(c.name, ", First"=>"")
            for i in c.code:nc.code
                push!(char_props, (; c..., name=name, code=i))
            end
        else
            push!(char_props, c)
        end
    end
    return char_props
end
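# Large uniform ranges appear in UnicodeData.txt as sentinel pairs, e.g.
#   4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
#   9FFF;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;
# which the loop above expands into one entry per codepoint (the exact range
# end varies with the Unicode version).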
char_props = read_unicode_data("UnicodeData.txt")
char_hash = Dict(c.code=>c for c in char_props)
#-------------------------------------------------------------------------------
# Read character widths from UAX #11: East Asian Width
function read_east_asian_widths(filename)
    ea_widths = Dict{UInt32,Int}()
    for (rng,widthcode) in read_hex_ranges(filename)
        w = widthcode == "W" || widthcode == "F" ? 2 :  # wide or full
            widthcode == "Na"|| widthcode == "H" ? 1 :  # narrow or half-width
            widthcode == "A" ? -1 :                     # ambiguous width
            nothing
        if !isnothing(w)
            set_all!(ea_widths, rng, w)
        end
    end
    return ea_widths
end
let ea_widths = read_east_asian_widths("EastAsianWidth.txt")
    # Following work by @jiahao, we compute character widths using a combination of
    #   * character category
    #   * UAX 11: East Asian Width
    #   * a few exceptions as needed
    # Adapted from http://nbviewer.ipython.org/gist/jiahao/07e8b08bf6d8671e9734
    global function derive_char_width(code, category)
        # Use a default width of 1 for all character categories that are
        # letter/symbol/number-like, as well as for unassigned/private-use chars.
        # This provides a useful nonzero fallback for new codepoints when a new
        # Unicode version has been released.
        width = 1

        # Various zero-width categories
        #
        # "Sk" not included in zero width - see issue #167
        if category in ("Mn", "Mc", "Me", "Zl", "Zp", "Cc", "Cf", "Cs")
            width = 0
        end

        # Widths from UAX #11: East Asian Width
        eaw = get(ea_widths, code, nothing)
        if !isnothing(eaw)
            width = eaw < 0 ? 1 : eaw
        end

        # A few exceptional cases, found by manual comparison to other wcwidth
        # functions and similar checks.
        if category == "Mn"
            width = 0
        end

        if code == 0x00ad
            # Soft hyphen is typically printed as a hyphen (-) in terminals.
            width = 1
        elseif code == 0x2028 || code == 0x2029
            # By definition, should have zero width (on the same line)
            # 0x002028 '\u2028' category: Zl name: LINE SEPARATOR/
            # 0x002029 '\u2029' category: Zp name: PARAGRAPH SEPARATOR/
            width = 0
        end

        return width
    end

    global function is_ambiguous_width(code)
        return get(ea_widths, code, 0) < 0
    end
end
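# Width sketch, assuming the usual UCD data (CJK ideographs are
# East_Asian_Width=W, U+00A1 is ambiguous-width category Po):
#   derive_char_width(0x4E00, "Lo")  # == 2 (wide)
#   derive_char_width(0x0301, "Mn")  # == 0 (combining acute accent)
#   derive_char_width(0x00A1, "Po")  # == 1 (ambiguous defaults to narrow)
#   is_ambiguous_width(0x00A1)       # == true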
#-------------------------------------------------------------------------------
# Construct data tables which will drive libutf8proc
#
# These tables are "compressed" with an ad-hoc compression scheme (largely some
# simple deduplication and indexing) which can easily and efficiently be
# decompressed on the C side at runtime.
# Inverse decomposition mapping tables for combining two characters into a single one.
comb_mapping = Dict{UInt32, Dict{UInt32, UInt32}}()
comb_issecond = Set{UInt32}()
for char in char_props
    # What happens with decompositions that are longer than 2?
    if isnothing(char.decomp_type) && !isnothing(char.decomp_mapping) &&
            length(char.decomp_mapping) == 2 && !isnothing(char_hash[char.decomp_mapping[1]]) &&
            char_hash[char.decomp_mapping[1]].combining_class == 0 &&
            (char.code ∉ exclusions && char.code ∉ excl_version)
        dm0 = char.decomp_mapping[1]
        dm1 = char.decomp_mapping[2]
        if !haskey(comb_mapping, dm0)
            comb_mapping[dm0] = Dict{UInt32, UInt32}()
        end
        comb_mapping[dm0][dm1] = char.code
        push!(comb_issecond, dm1)
    end
end
comb_index = Dict{UInt32, UInt32}()
comb_length = Dict{UInt32, UInt32}()
let
    ind = 0
    for dm0 in sort!(collect(keys(comb_mapping)))
        comb_index[dm0] = ind
        len = length(comb_mapping[dm0])
        comb_length[dm0] = len
        ind += len
    end
end
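# The per-starter dictionaries are later flattened, ordered by `dm0`, into the
# utf8proc_combinations_* C arrays below: `comb_index[dm0]` is the offset of a
# starter's first entry and `comb_length[dm0]` its entry count. A hypothetical
# sketch: if 0x41 'A' were the smallest starter, comb_index[0x41] would be 0
# and its composed characters (0xC0 'À', 0xC1 'Á', ...) would fill the first
# comb_length[0x41] slots of utf8proc_combinations_combined.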
utf16_encode(utf32_seq) = transcode(UInt16, transcode(String, utf32_seq))
# Utility for packing all UTF-16 encoded sequences into one big array
struct UTF16Sequences
    storage::Vector{UInt16}
    indices::Dict{Vector{UInt16},Int}
end
UTF16Sequences() = UTF16Sequences(UInt16[], Dict{Vector{UInt16},Int}())
"""
Return "sequence code" (seqindex in the C code) for a sequence: a UInt16 where
* The 14 low bits are the index into the `sequences.storage` array where the
sequence resides
* The two top bits are the length of the sequence, or if equal to 3, the first
entry of the sequence itself contains the length.
"""
function encode_sequence!(sequences::UTF16Sequences, utf32_seq::Vector)
if length(utf32_seq) == 0
return typemax(UInt16)
end
# lencode contains the length of the UTF-32 sequence after decoding
# No sequence has len 0, so we encode len 1 as 0, len 2 as 1.
# We have only 2 bits for the length, though, so longer sequences are
# encoded in the sequence data itself.
seq_lencode = length(utf32_seq) - 1
utf16_seq = utf16_encode(utf32_seq)
idx = get!(sequences.indices, utf16_seq) do
i = length(sequences.storage)
utf16_seq_enc = seq_lencode < 3 ? utf16_seq :
pushfirst!(copy(utf16_seq), seq_lencode)
append!(sequences.storage, utf16_seq_enc)
i
end
@assert idx <= 0x3FFF
seq_code = idx | (min(seq_lencode, 3) << 14)
return seq_code
end
function encode_sequence!(sequences::UTF16Sequences, code::Integer)
encode_sequence!(sequences, [code])
end
function encode_sequence!(sequences::UTF16Sequences, ::Nothing)
return typemax(UInt16)
end
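# Worked example of the packing: a two-codepoint mapping stored at offset 5 in
# `sequences.storage` has seq_lencode = 1, so its sequence code is
#   5 | (1 << 14) == 0x4005
# The decoder recovers offset 0x4005 & 0x3FFF == 5 and length
# (0x4005 >> 14) + 1 == 2. Sequences of four or more codepoints get lencode 3
# and store their own lencode (length - 1) as the first entry.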
function char_table_properties!(sequences, char)
    code = char.code
    return (
        category = char.category,
        combining_class = char.combining_class,
        bidi_class = char.bidi_class,
        decomp_type = char.decomp_type,
        decomp_seqindex = encode_sequence!(sequences, char.decomp_mapping),
        casefold_seqindex = encode_sequence!(sequences, get_case_folding(code)),
        uppercase_seqindex = encode_sequence!(sequences, char.uppercase_mapping),
        lowercase_seqindex = encode_sequence!(sequences, char.lowercase_mapping),
        titlecase_seqindex = encode_sequence!(sequences, char.titlecase_mapping),
        comb_index = get(comb_index, code, 0x3FF), # see utf8proc_property_struct::comb_index
        comb_length = get(comb_length, code, 0),
        comb_issecond = code in comb_issecond,
        bidi_mirrored = char.bidi_mirrored,
        comp_exclusion = code in exclusions || code in excl_version,
        ignorable = code in ignorable,
        control_boundary = char.category in ("Zl", "Zp", "Cc", "Cf") &&
                           !(char.code in (0x200C, 0x200D)),
        charwidth = derive_char_width(code, char.category),
        ambiguous_width = is_ambiguous_width(code),
        boundclass = get_grapheme_boundclass(code),
        indic_conjunct_break = get_indic_conjunct_break(code),
    )
end
# Many character properties are duplicates. Deduplicate them, constructing a
# per-character array of indices into the properties array.
sequences = UTF16Sequences()
char_table_props = [char_table_properties!(sequences, cp) for cp in char_props]
deduplicated_props = Origin(0)(Vector{eltype(char_table_props)}())
char_property_indices = Origin(0)(zeros(Int, 0x00110000))
let index_map = Dict{eltype(char_table_props),Int}()
    for (char, table_props) in zip(char_props, char_table_props)
        entry_idx = get!(index_map, table_props) do
            idx = length(deduplicated_props)
            push!(deduplicated_props, table_props)
            idx
        end
        # Add 1 because unassigned codes occupy slot at index 0
        char_property_indices[char.code] = entry_idx + 1
    end
end
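# Far fewer distinct property records remain than the 0x110000 codepoints they
# describe. Index 0 maps to the default record for unassigned codepoints (the
# first row emitted in print_c_data_tables below), which is why
# char_property_indices is zero-initialized.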
# Now compress char_property_indices by breaking it into pages and
# deduplicating those (this works as compression because there are large
# contiguous ranges of code space with identical properties)
prop_page_indices = Int[]
prop_pages = Int[]
let
    page_size = 0x100
    page_index_map = Dict{Vector{Int}, Int}()
    for page in Iterators.partition(char_property_indices, page_size)
        page_idx = get!(page_index_map, page) do
            idx = length(prop_pages)
            append!(prop_pages, page)
            idx
        end
        push!(prop_page_indices, page_idx)
    end
end
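# This is the classic two-stage (page table) lookup. With page_size 0x100 the
# C side recovers a character's property record roughly as (sketch of the
# lookup in utf8proc.c):
#   utf8proc_properties[utf8proc_stage2table[utf8proc_stage1table[uc >> 8] + (uc & 0xFF)]]
# Identical 256-entry pages (e.g. long unassigned or uniform ranges) are
# stored only once in prop_pages.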
#-------------------------------------------------------------------------------
function write_c_index_array(io, array, linelen)
    print(io, "{\n  ")
    i = 0
    for x in array
        i += 1
        if i == linelen
            i = 0
            print(io, "\n  ")
        end
        print(io, x, ", ")
    end
    print(io, "};\n\n")
end
function c_enum_name(prefix, str)
    if isnothing(str)
        return "0"
    else
        return "UTF8PROC_$(prefix)_$(Base.uppercase(str))"
    end
end
function c_uint16(seqindex)
    if seqindex == typemax(UInt16)
        return "UINT16_MAX"
    else
        return string(seqindex)
    end
end
function print_c_data_tables(io, sequences, prop_page_indices, prop_pages, deduplicated_props,
                             comb_index, comb_length, comb_issecond)
    print(io, "static const utf8proc_uint16_t utf8proc_sequences[] = ")
    write_c_index_array(io, sequences.storage, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage1table[] = ")
    write_c_index_array(io, prop_page_indices, 8)
    print(io, "static const utf8proc_uint16_t utf8proc_stage2table[] = ")
    write_c_index_array(io, prop_pages, 8)

    print(io, """
        static const utf8proc_property_t utf8proc_properties[] = {
          {0, 0, 0, 0, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, UINT16_MAX, 0x3FF,0,false, false,false,false,false, 1, 0, 0, UTF8PROC_BOUNDCLASS_OTHER, UTF8PROC_INDIC_CONJUNCT_BREAK_NONE},
        """)
    for prop in deduplicated_props
        print(io, "  {",
              c_enum_name("CATEGORY", prop.category), ", ",
              prop.combining_class, ", ",
              c_enum_name("BIDI_CLASS", prop.bidi_class), ", ",
              c_enum_name("DECOMP_TYPE", prop.decomp_type), ", ",
              c_uint16(prop.decomp_seqindex), ", ",
              c_uint16(prop.casefold_seqindex), ", ",
              c_uint16(prop.uppercase_seqindex), ", ",
              c_uint16(prop.lowercase_seqindex), ", ",
              c_uint16(prop.titlecase_seqindex), ", ",
              c_uint16(prop.comb_index), ", ",
              c_uint16(prop.comb_length), ", ",
              prop.comb_issecond, ", ",
              prop.bidi_mirrored, ", ",
              prop.comp_exclusion, ", ",
              prop.ignorable, ", ",
              prop.control_boundary, ", ",
              prop.charwidth, ", ",
              prop.ambiguous_width, ", ",
              "0, ", # bitfield padding
              c_enum_name("BOUNDCLASS", prop.boundclass), ", ",
              c_enum_name("INDIC_CONJUNCT_BREAK", prop.indic_conjunct_break),
              "},\n"
        )
    end
    print(io, "};\n\n")

    print(io, "static const utf8proc_int32_t utf8proc_combinations_second[] = {\n")
    for dm0 in sort!(collect(keys(comb_mapping)))
        print(io, "  ")
        for dm1 in sort!(collect(keys(comb_mapping[dm0])))
            print(io, " ", dm1, ",")
        end
        print(io, "\n")
    end
    print(io, "};\n\n")

    print(io, "static const utf8proc_int32_t utf8proc_combinations_combined[] = {\n")
    for dm0 in sort!(collect(keys(comb_mapping)))
        print(io, "  ")
        for dm1 in sort!(collect(keys(comb_mapping[dm0])))
            code = comb_mapping[dm0][dm1]
            print(io, " ", code, ",")
        end
        print(io, "\n")
    end
    print(io, "};\n\n")
end
if !isinteractive()
    print_c_data_tables(stdout, sequences, prop_page_indices, prop_pages, deduplicated_props,
                        comb_index, comb_length, comb_issecond)
end