Jump to content

Module:Unicode data

د ويکيسيند لخوا

Documentation for this module may be created at Module:Unicode data/لاسوند

local export = {}

-- http://www.unicode.org/Public/UNIDATA/Jamo.txt
-- Jamo short names used to assemble Hangul syllable names. Each table is
-- zero-based: the first entry is stored explicitly at index 0 and the
-- positional entries that follow land at indices 1, 2, ...

-- Leading consonants, indices 0..18 (index 11 is the empty string).
local hangul_leads = {
	[0] = "G",
	"GG", "N", "D", "DD", "R", "M",   -- 1..6
	"B", "BB", "S", "SS", "", "J",    -- 7..12
	"JJ", "C", "K", "T", "P", "H",    -- 13..18
}

-- Vowels, indices 0..20.
local hangul_vowels = {
	[0] = "A",
	"AE", "YA", "YAE", "EO", "E",     -- 1..5
	"YEO", "YE", "O", "WA", "WAE",    -- 6..10
	"OE", "YO", "U", "WEO", "WE",     -- 11..15
	"WI", "YU", "EU", "YI", "I",      -- 16..20
}

-- Trailing consonants, indices 0..27 (index 0 is the empty string).
local hangul_trails = {
	[0] = "",
	"G", "GG", "GS", "N", "NJ", "NH", "D",     -- 1..7
	"L", "LG", "LM", "LB", "LS", "LT", "LP",   -- 8..14
	"LH", "M", "B", "BS", "S", "SS", "NG",     -- 15..21
	"J", "C", "K", "T", "P", "H",              -- 22..27
}

-- Code point ranges whose names are derived by rule instead of being looked
-- up in the per-page name data. Each entry is
--   { first, last, format_string_or_function }
-- with both bounds inclusive. A string is applied with string.format to the
-- code point; a function is called with the code point and returns the name.
--
-- The entries MUST stay sorted by ascending `first` and must not overlap:
-- lookup_name stops scanning as soon as it meets an entry that starts above
-- the code point it is resolving.
local name_hooks = {
	{     0x00,     0x1f, "<control-%04X>" }, -- C0 control characters
	{     0x80,     0x9f, "<control-%04X>" }, -- C1 control characters
	{   0x3400,   0x4db5, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension A
	{   0x4e00,   0x9fcc, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph
	{   0xac00,   0xd7a3, function (codepoint)
		-- Hangul syllable names are composed from the short names of the
		-- syllable's leading consonant, vowel and trailing consonant.
		local m_hangul = require('Module:ko-hangul')
		local li, vi, ti = m_hangul.syllableIndex2JamoIndices(
			codepoint - 0xac00
		)

		-- The indices are used as-is against the zero-based jamo tables.
		-- NOTE(review): presumably syllableIndex2JamoIndices returns
		-- zero-based indices — confirm against Module:ko-hangul.
		return ("HANGUL SYLLABLE %s%s%s"):format(
			hangul_leads[li],
			hangul_vowels[vi],
			hangul_trails[ti]
		)
	end },
	{   0xd800,   0xdb7f, "<surrogate-%04X>" }, -- Non Private Use High Surrogate
	{   0xdb80,   0xdbff, "<surrogate-%04X>" }, -- Private Use High Surrogate
	{   0xdc00,   0xdfff, "<surrogate-%04X>" }, -- Low Surrogate
	{   0xe000,   0xf8ff, "<private-use-%04X>" }, -- Private Use
	{  0x20000,  0x2a6d6, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension B
	{  0x2a700,  0x2b734, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension C
	-- Extension D starts at U+2B740 (the same start the `blocks` table uses).
	-- It was previously written as 0x2a740, which overlapped Extension C and
	-- gave the unassigned code points U+2B735..U+2B73F an ideograph name.
	{  0x2b740,  0x2b81d, "CJK UNIFIED IDEOGRAPH-%04X" }, -- CJK Ideograph Extension D
	{  0x2f800,  0x2fa1d, "CJK COMPATIBILITY IDEOGRAPH-%05X" }, -- CJK Compatibility Ideographs Supplement (Supplementary Ideographic Plane)
	{  0xf0000,  0xffffd, "<private-use-%05X>" }, -- Plane 15 Private Use
	{ 0x100000, 0x10fffd, "<private-use-%06X>" }  -- Plane 16 Private Use
}

-- Set of the 66 Unicode noncharacter code points (code point -> true):
-- the contiguous run U+FDD0..U+FDEF plus the last two code points
-- (xxFFFE and xxFFFF) of each of the 17 planes.
local noncharacters = {}

for cp = 0xfdd0, 0xfdef do
	noncharacters[cp] = true
end

for plane = 0, 0x10 do
	local plane_base = plane * 0x10000
	noncharacters[plane_base + 0xfffe] = true
	noncharacters[plane_base + 0xffff] = true
end

-- The name_hooks entry that matched most recently; it is tried first on the
-- next call so that runs of lookups inside one range skip the scan.
local name_range_cache

-- Produces the name for `codepoint` from one name_hooks entry: a string in
-- slot 3 is used as a format string, anything else is called as a function.
local function render_hook_name(hook, codepoint)
	local rule = hook[3]
	if type(rule) == "string" then
		return rule:format(codepoint)
	end
	return rule(codepoint)
end

-- Returns the name of a code point.
--
-- Resolution order:
--   1. noncharacters            -> "<noncharacter-XXXX>"
--   2. rule-based ranges listed in name_hooks
--   3. the data page 'Module:Unicode data/names/XXX', where XXX is the code
--      point divided by 0x1000 (rounded down) in hexadecimal
--   4. fallback                 -> "<U-XXXXXX>"
--
-- @param codepoint number
-- @return string
function export.lookup_name(codepoint)
	if noncharacters[codepoint] then
		return ("<noncharacter-%.4X>"):format(codepoint)
	end

	local cached = name_range_cache
	if cached and (codepoint >= cached[1]) and (codepoint <= cached[2]) then
		return render_hook_name(cached, codepoint)
	end

	for _, hook in ipairs(name_hooks) do
		if codepoint < hook[1] then
			-- name_hooks is sorted by start, so no later entry can match
			break
		elseif codepoint <= hook[2] then
			name_range_cache = hook
			return render_hook_name(hook, codepoint)
		end
	end

	-- The data page may not exist; pcall turns that into a plain miss.
	local page = ('Module:Unicode data/names/%03X'):format(
		math.floor(codepoint / 0x1000)
	)
	local success, data = pcall(mw.loadData, page)
	local name = success and data[codepoint]
	if name then
		return name
	end

	return ("<U-%06X>"):format(codepoint)
end

-- Template entry point for lookup_name.
--
-- Reads the code point from the first positional argument of the #invoke
-- call, or failing that from the first positional argument of the calling
-- template, and returns the character name with every "<" replaced by
-- "&lt;" so that placeholder names such as "<control-0000>" are not parsed
-- as markup.
--
-- @param frame the Scribunto frame object
-- @return string (exactly one value)
-- Raises an error when the argument is missing or not a number.
function export.template_lookup_name(frame)
	local raw = frame.args[1] or frame:getParent().args[1]
	local codepoint = tonumber(raw)
	if not codepoint then
		-- Without this check the failure surfaced deep inside lookup_name as
		-- an "attempt to compare nil with number" error.
		error(("template_lookup_name: expected a numeric code point, got %s")
			:format(tostring(raw)))
	end

	local name = export.lookup_name(codepoint)
	-- string.gsub returns (result, replacement count). The parentheses keep
	-- only the result; otherwise the count would be returned as a second
	-- value and end up appended to the output.
	return (name:gsub("<", "&lt;"))
end

-- Names of the Unicode planes that have one, keyed by plane number
-- (code point divided by 0x10000, rounded down). Planes without an entry
-- are rendered generically by lookup_plane.
--
-- The last three keys used to be 13, 14 and 15, which was off by one: the
-- Supplementary Special-purpose Plane is plane 14 (U+E0000..), and the two
-- Supplementary Private Use Areas are planes 15 and 16 (U+F0000.. and
-- U+100000.., matching the ranges given in the `blocks` table).
--
-- NOTE(review): the two Private Use names appear to contain a soft hyphen
-- (U+00AD) after "Supplement"; the strings are kept exactly as they were —
-- confirm that this is intentional.
local planes = {
	[ 0] = "Basic Multilingual Plane";
	[ 1] = "Supplementary Multilingual Plane";
	[ 2] = "Supplementary Ideographic Plane";
	[14] = "Supplementary Special-purpose Plane";
	[15] = "Supplement­ary Private Use Area-A";
	[16] = "Supplement­ary Private Use Area-B";
}

-- http://www.unicode.org/Public/UNIDATA/Blocks.txt
-- Maps each Unicode block name to { first code point, last code point },
-- both bounds inclusive. The keys are plain strings, so the table has no
-- inherent order; enum_blocks() sorts the entries by starting code point
-- when an ordered traversal is needed.
-- NOTE(review): the list looks like it mirrors an older release of
-- Blocks.txt (for instance, plane 2 has nothing between CJK Extension D and
-- the CJK Compatibility Ideographs Supplement) — confirm against the current
-- file before relying on it being complete.
local blocks = {
	["Basic Latin"                                     ] = { 0x000000, 0x00007f };
	["Latin-1 Supplement"                              ] = { 0x000080, 0x0000ff };
	["Latin Extended-A"                                ] = { 0x000100, 0x00017f };
	["Latin Extended-B"                                ] = { 0x000180, 0x00024f };
	["IPA Extensions"                                  ] = { 0x000250, 0x0002af };
	["Spacing Modifier Letters"                        ] = { 0x0002b0, 0x0002ff };
	["Combining Diacritical Marks"                     ] = { 0x000300, 0x00036f };
	["Greek and Coptic"                                ] = { 0x000370, 0x0003ff };
	["Cyrillic"                                        ] = { 0x000400, 0x0004ff };
	["Cyrillic Supplement"                             ] = { 0x000500, 0x00052f };
	["Armenian"                                        ] = { 0x000530, 0x00058f };
	["Hebrew"                                          ] = { 0x000590, 0x0005ff };
	["Arabic"                                          ] = { 0x000600, 0x0006ff };
	["Syriac"                                          ] = { 0x000700, 0x00074f };
	["Arabic Supplement"                               ] = { 0x000750, 0x00077f };
	["Thaana"                                          ] = { 0x000780, 0x0007bf };
	["NKo"                                             ] = { 0x0007c0, 0x0007ff };
	["Samaritan"                                       ] = { 0x000800, 0x00083f };
	["Mandaic"                                         ] = { 0x000840, 0x00085f };
	["Arabic Extended-A"                               ] = { 0x0008a0, 0x0008ff };
	["Devanagari"                                      ] = { 0x000900, 0x00097f };
	["Bengali"                                         ] = { 0x000980, 0x0009ff };
	["Gurmukhi"                                        ] = { 0x000a00, 0x000a7f };
	["Gujarati"                                        ] = { 0x000a80, 0x000aff };
	["Oriya"                                           ] = { 0x000b00, 0x000b7f };
	["Tamil"                                           ] = { 0x000b80, 0x000bff };
	["Telugu"                                          ] = { 0x000c00, 0x000c7f };
	["Kannada"                                         ] = { 0x000c80, 0x000cff };
	["Malayalam"                                       ] = { 0x000d00, 0x000d7f };
	["Sinhala"                                         ] = { 0x000d80, 0x000dff };
	["Thai"                                            ] = { 0x000e00, 0x000e7f };
	["Lao"                                             ] = { 0x000e80, 0x000eff };
	["Tibetan"                                         ] = { 0x000f00, 0x000fff };
	["Myanmar"                                         ] = { 0x001000, 0x00109f };
	["Georgian"                                        ] = { 0x0010a0, 0x0010ff };
	["Hangul Jamo"                                     ] = { 0x001100, 0x0011ff };
	["Ethiopic"                                        ] = { 0x001200, 0x00137f };
	["Ethiopic Supplement"                             ] = { 0x001380, 0x00139f };
	["Cherokee"                                        ] = { 0x0013a0, 0x0013ff };
	["Unified Canadian Aboriginal Syllabics"           ] = { 0x001400, 0x00167f };
	["Ogham"                                           ] = { 0x001680, 0x00169f };
	["Runic"                                           ] = { 0x0016a0, 0x0016ff };
	["Tagalog"                                         ] = { 0x001700, 0x00171f };
	["Hanunoo"                                         ] = { 0x001720, 0x00173f };
	["Buhid"                                           ] = { 0x001740, 0x00175f };
	["Tagbanwa"                                        ] = { 0x001760, 0x00177f };
	["Khmer"                                           ] = { 0x001780, 0x0017ff };
	["Mongolian"                                       ] = { 0x001800, 0x0018af };
	["Unified Canadian Aboriginal Syllabics Extended"  ] = { 0x0018b0, 0x0018ff };
	["Limbu"                                           ] = { 0x001900, 0x00194f };
	["Tai Le"                                          ] = { 0x001950, 0x00197f };
	["New Tai Lue"                                     ] = { 0x001980, 0x0019df };
	["Khmer Symbols"                                   ] = { 0x0019e0, 0x0019ff };
	["Buginese"                                        ] = { 0x001a00, 0x001a1f };
	["Tai Tham"                                        ] = { 0x001a20, 0x001aaf };
	["Combining Diacritical Marks Extended"            ] = { 0x001ab0, 0x001aff };
	["Balinese"                                        ] = { 0x001b00, 0x001b7f };
	["Sundanese"                                       ] = { 0x001b80, 0x001bbf };
	["Batak"                                           ] = { 0x001bc0, 0x001bff };
	["Lepcha"                                          ] = { 0x001c00, 0x001c4f };
	["Ol Chiki"                                        ] = { 0x001c50, 0x001c7f };
	["Sundanese Supplement"                            ] = { 0x001cc0, 0x001ccf };
	["Vedic Extensions"                                ] = { 0x001cd0, 0x001cff };
	["Phonetic Extensions"                             ] = { 0x001d00, 0x001d7f };
	["Phonetic Extensions Supplement"                  ] = { 0x001d80, 0x001dbf };
	["Combining Diacritical Marks Supplement"          ] = { 0x001dc0, 0x001dff };
	["Latin Extended Additional"                       ] = { 0x001e00, 0x001eff };
	["Greek Extended"                                  ] = { 0x001f00, 0x001fff };
	["General Punctuation"                             ] = { 0x002000, 0x00206f };
	["Superscripts and Subscripts"                     ] = { 0x002070, 0x00209f };
	["Currency Symbols"                                ] = { 0x0020a0, 0x0020cf };
	["Combining Diacritical Marks for Symbols"         ] = { 0x0020d0, 0x0020ff };
	["Letterlike Symbols"                              ] = { 0x002100, 0x00214f };
	["Number Forms"                                    ] = { 0x002150, 0x00218f };
	["Arrows"                                          ] = { 0x002190, 0x0021ff };
	["Mathematical Operators"                          ] = { 0x002200, 0x0022ff };
	["Miscellaneous Technical"                         ] = { 0x002300, 0x0023ff };
	["Control Pictures"                                ] = { 0x002400, 0x00243f };
	["Optical Character Recognition"                   ] = { 0x002440, 0x00245f };
	["Enclosed Alphanumerics"                          ] = { 0x002460, 0x0024ff };
	["Box Drawing"                                     ] = { 0x002500, 0x00257f };
	["Block Elements"                                  ] = { 0x002580, 0x00259f };
	["Geometric Shapes"                                ] = { 0x0025a0, 0x0025ff };
	["Miscellaneous Symbols"                           ] = { 0x002600, 0x0026ff };
	["Dingbats"                                        ] = { 0x002700, 0x0027bf };
	["Miscellaneous Mathematical Symbols-A"            ] = { 0x0027c0, 0x0027ef };
	["Supplemental Arrows-A"                           ] = { 0x0027f0, 0x0027ff };
	["Braille Patterns"                                ] = { 0x002800, 0x0028ff };
	["Supplemental Arrows-B"                           ] = { 0x002900, 0x00297f };
	["Miscellaneous Mathematical Symbols-B"            ] = { 0x002980, 0x0029ff };
	["Supplemental Mathematical Operators"             ] = { 0x002a00, 0x002aff };
	["Miscellaneous Symbols and Arrows"                ] = { 0x002b00, 0x002bff };
	["Glagolitic"                                      ] = { 0x002c00, 0x002c5f };
	["Latin Extended-C"                                ] = { 0x002c60, 0x002c7f };
	["Coptic"                                          ] = { 0x002c80, 0x002cff };
	["Georgian Supplement"                             ] = { 0x002d00, 0x002d2f };
	["Tifinagh"                                        ] = { 0x002d30, 0x002d7f };
	["Ethiopic Extended"                               ] = { 0x002d80, 0x002ddf };
	["Cyrillic Extended-A"                             ] = { 0x002de0, 0x002dff };
	["Supplemental Punctuation"                        ] = { 0x002e00, 0x002e7f };
	["CJK Radicals Supplement"                         ] = { 0x002e80, 0x002eff };
	["Kangxi Radicals"                                 ] = { 0x002f00, 0x002fdf };
	["Ideographic Description Characters"              ] = { 0x002ff0, 0x002fff };
	["CJK Symbols and Punctuation"                     ] = { 0x003000, 0x00303f };
	["Hiragana"                                        ] = { 0x003040, 0x00309f };
	["Katakana"                                        ] = { 0x0030a0, 0x0030ff };
	["Bopomofo"                                        ] = { 0x003100, 0x00312f };
	["Hangul Compatibility Jamo"                       ] = { 0x003130, 0x00318f };
	["Kanbun"                                          ] = { 0x003190, 0x00319f };
	["Bopomofo Extended"                               ] = { 0x0031a0, 0x0031bf };
	["CJK Strokes"                                     ] = { 0x0031c0, 0x0031ef };
	["Katakana Phonetic Extensions"                    ] = { 0x0031f0, 0x0031ff };
	["Enclosed CJK Letters and Months"                 ] = { 0x003200, 0x0032ff };
	["CJK Compatibility"                               ] = { 0x003300, 0x0033ff };
	["CJK Unified Ideographs Extension A"              ] = { 0x003400, 0x004dbf };
	["Yijing Hexagram Symbols"                         ] = { 0x004dc0, 0x004dff };
	["CJK Unified Ideographs"                          ] = { 0x004e00, 0x009fff };
	["Yi Syllables"                                    ] = { 0x00a000, 0x00a48f };
	["Yi Radicals"                                     ] = { 0x00a490, 0x00a4cf };
	["Lisu"                                            ] = { 0x00a4d0, 0x00a4ff };
	["Vai"                                             ] = { 0x00a500, 0x00a63f };
	["Cyrillic Extended-B"                             ] = { 0x00a640, 0x00a69f };
	["Bamum"                                           ] = { 0x00a6a0, 0x00a6ff };
	["Modifier Tone Letters"                           ] = { 0x00a700, 0x00a71f };
	["Latin Extended-D"                                ] = { 0x00a720, 0x00a7ff };
	["Syloti Nagri"                                    ] = { 0x00a800, 0x00a82f };
	["Common Indic Number Forms"                       ] = { 0x00a830, 0x00a83f };
	["Phags-pa"                                        ] = { 0x00a840, 0x00a87f };
	["Saurashtra"                                      ] = { 0x00a880, 0x00a8df };
	["Devanagari Extended"                             ] = { 0x00a8e0, 0x00a8ff };
	["Kayah Li"                                        ] = { 0x00a900, 0x00a92f };
	["Rejang"                                          ] = { 0x00a930, 0x00a95f };
	["Hangul Jamo Extended-A"                          ] = { 0x00a960, 0x00a97f };
	["Javanese"                                        ] = { 0x00a980, 0x00a9df };
	["Myanmar Extended-B"                              ] = { 0x00a9e0, 0x00a9ff };
	["Cham"                                            ] = { 0x00aa00, 0x00aa5f };
	["Myanmar Extended-A"                              ] = { 0x00aa60, 0x00aa7f };
	["Tai Viet"                                        ] = { 0x00aa80, 0x00aadf };
	["Meetei Mayek Extensions"                         ] = { 0x00aae0, 0x00aaff };
	["Ethiopic Extended-A"                             ] = { 0x00ab00, 0x00ab2f };
	["Latin Extended-E"                                ] = { 0x00ab30, 0x00ab6f };
	["Meetei Mayek"                                    ] = { 0x00abc0, 0x00abff };
	["Hangul Syllables"                                ] = { 0x00ac00, 0x00d7af };
	["Hangul Jamo Extended-B"                          ] = { 0x00d7b0, 0x00d7ff };
	["High Surrogates"                                 ] = { 0x00d800, 0x00db7f };
	["High Private Use Surrogates"                     ] = { 0x00db80, 0x00dbff };
	["Low Surrogates"                                  ] = { 0x00dc00, 0x00dfff };
	["Private Use Area"                                ] = { 0x00e000, 0x00f8ff };
	["CJK Compatibility Ideographs"                    ] = { 0x00f900, 0x00faff };
	["Alphabetic Presentation Forms"                   ] = { 0x00fb00, 0x00fb4f };
	["Arabic Presentation Forms-A"                     ] = { 0x00fb50, 0x00fdff };
	["Variation Selectors"                             ] = { 0x00fe00, 0x00fe0f };
	["Vertical Forms"                                  ] = { 0x00fe10, 0x00fe1f };
	["Combining Half Marks"                            ] = { 0x00fe20, 0x00fe2f };
	["CJK Compatibility Forms"                         ] = { 0x00fe30, 0x00fe4f };
	["Small Form Variants"                             ] = { 0x00fe50, 0x00fe6f };
	["Arabic Presentation Forms-B"                     ] = { 0x00fe70, 0x00feff };
	["Halfwidth and Fullwidth Forms"                   ] = { 0x00ff00, 0x00ffef };
	["Specials"                                        ] = { 0x00fff0, 0x00ffff };
	["Linear B Syllabary"                              ] = { 0x010000, 0x01007f };
	["Linear B Ideograms"                              ] = { 0x010080, 0x0100ff };
	["Aegean Numbers"                                  ] = { 0x010100, 0x01013f };
	["Ancient Greek Numbers"                           ] = { 0x010140, 0x01018f };
	["Ancient Symbols"                                 ] = { 0x010190, 0x0101cf };
	["Phaistos Disc"                                   ] = { 0x0101d0, 0x0101ff };
	["Lycian"                                          ] = { 0x010280, 0x01029f };
	["Carian"                                          ] = { 0x0102a0, 0x0102df };
	["Coptic Epact Numbers"                            ] = { 0x0102e0, 0x0102ff };
	["Old Italic"                                      ] = { 0x010300, 0x01032f };
	["Gothic"                                          ] = { 0x010330, 0x01034f };
	["Old Permic"                                      ] = { 0x010350, 0x01037f };
	["Ugaritic"                                        ] = { 0x010380, 0x01039f };
	["Old Persian"                                     ] = { 0x0103a0, 0x0103df };
	["Deseret"                                         ] = { 0x010400, 0x01044f };
	["Shavian"                                         ] = { 0x010450, 0x01047f };
	["Osmanya"                                         ] = { 0x010480, 0x0104af };
	["Elbasan"                                         ] = { 0x010500, 0x01052f };
	["Caucasian Albanian"                              ] = { 0x010530, 0x01056f };
	["Linear A"                                        ] = { 0x010600, 0x01077f };
	["Cypriot Syllabary"                               ] = { 0x010800, 0x01083f };
	["Imperial Aramaic"                                ] = { 0x010840, 0x01085f };
	["Palmyrene"                                       ] = { 0x010860, 0x01087f };
	["Nabataean"                                       ] = { 0x010880, 0x0108af };
	["Phoenician"                                      ] = { 0x010900, 0x01091f };
	["Lydian"                                          ] = { 0x010920, 0x01093f };
	["Meroitic Hieroglyphs"                            ] = { 0x010980, 0x01099f };
	["Meroitic Cursive"                                ] = { 0x0109a0, 0x0109ff };
	["Kharoshthi"                                      ] = { 0x010a00, 0x010a5f };
	["Old South Arabian"                               ] = { 0x010a60, 0x010a7f };
	["Old North Arabian"                               ] = { 0x010a80, 0x010a9f };
	["Manichaean"                                      ] = { 0x010ac0, 0x010aff };
	["Avestan"                                         ] = { 0x010b00, 0x010b3f };
	["Inscriptional Parthian"                          ] = { 0x010b40, 0x010b5f };
	["Inscriptional Pahlavi"                           ] = { 0x010b60, 0x010b7f };
	["Psalter Pahlavi"                                 ] = { 0x010b80, 0x010baf };
	["Old Turkic"                                      ] = { 0x010c00, 0x010c4f };
	["Rumi Numeral Symbols"                            ] = { 0x010e60, 0x010e7f };
	["Brahmi"                                          ] = { 0x011000, 0x01107f };
	["Kaithi"                                          ] = { 0x011080, 0x0110cf };
	["Sora Sompeng"                                    ] = { 0x0110d0, 0x0110ff };
	["Chakma"                                          ] = { 0x011100, 0x01114f };
	["Mahajani"                                        ] = { 0x011150, 0x01117f };
	["Sharada"                                         ] = { 0x011180, 0x0111df };
	["Sinhala Archaic Numbers"                         ] = { 0x0111e0, 0x0111ff };
	["Khojki"                                          ] = { 0x011200, 0x01124f };
	["Khudawadi"                                       ] = { 0x0112b0, 0x0112ff };
	["Grantha"                                         ] = { 0x011300, 0x01137f };
	["Tirhuta"                                         ] = { 0x011480, 0x0114df };
	["Siddham"                                         ] = { 0x011580, 0x0115ff };
	["Modi"                                            ] = { 0x011600, 0x01165f };
	["Takri"                                           ] = { 0x011680, 0x0116cf };
	["Warang Citi"                                     ] = { 0x0118a0, 0x0118ff };
	["Pau Cin Hau"                                     ] = { 0x011ac0, 0x011aff };
	["Cuneiform"                                       ] = { 0x012000, 0x0123ff };
	["Cuneiform Numbers and Punctuation"               ] = { 0x012400, 0x01247f };
	["Egyptian Hieroglyphs"                            ] = { 0x013000, 0x01342f };
	["Bamum Supplement"                                ] = { 0x016800, 0x016a3f };
	["Mro"                                             ] = { 0x016a40, 0x016a6f };
	["Bassa Vah"                                       ] = { 0x016ad0, 0x016aff };
	["Pahawh Hmong"                                    ] = { 0x016b00, 0x016b8f };
	["Miao"                                            ] = { 0x016f00, 0x016f9f };
	["Kana Supplement"                                 ] = { 0x01b000, 0x01b0ff };
	["Duployan"                                        ] = { 0x01bc00, 0x01bc9f };
	["Shorthand Format Controls"                       ] = { 0x01bca0, 0x01bcaf };
	["Byzantine Musical Symbols"                       ] = { 0x01d000, 0x01d0ff };
	["Musical Symbols"                                 ] = { 0x01d100, 0x01d1ff };
	["Ancient Greek Musical Notation"                  ] = { 0x01d200, 0x01d24f };
	["Tai Xuan Jing Symbols"                           ] = { 0x01d300, 0x01d35f };
	["Counting Rod Numerals"                           ] = { 0x01d360, 0x01d37f };
	["Mathematical Alphanumeric Symbols"               ] = { 0x01d400, 0x01d7ff };
	["Mende Kikakui"                                   ] = { 0x01e800, 0x01e8df };
	["Arabic Mathematical Alphabetic Symbols"          ] = { 0x01ee00, 0x01eeff };
	["Mahjong Tiles"                                   ] = { 0x01f000, 0x01f02f };
	["Domino Tiles"                                    ] = { 0x01f030, 0x01f09f };
	["Playing Cards"                                   ] = { 0x01f0a0, 0x01f0ff };
	["Enclosed Alphanumeric Supplement"                ] = { 0x01f100, 0x01f1ff };
	["Enclosed Ideographic Supplement"                 ] = { 0x01f200, 0x01f2ff };
	["Miscellaneous Symbols and Pictographs"           ] = { 0x01f300, 0x01f5ff };
	["Emoticons"                                       ] = { 0x01f600, 0x01f64f };
	["Ornamental Dingbats"                             ] = { 0x01f650, 0x01f67f };
	["Transport and Map Symbols"                       ] = { 0x01f680, 0x01f6ff };
	["Alchemical Symbols"                              ] = { 0x01f700, 0x01f77f };
	["Geometric Shapes Extended"                       ] = { 0x01f780, 0x01f7ff };
	["Supplemental Arrows-C"                           ] = { 0x01f800, 0x01f8ff };
	["CJK Unified Ideographs Extension B"              ] = { 0x020000, 0x02a6df };
	["CJK Unified Ideographs Extension C"              ] = { 0x02a700, 0x02b73f };
	["CJK Unified Ideographs Extension D"              ] = { 0x02b740, 0x02b81f };
	["CJK Compatibility Ideographs Supplement"         ] = { 0x02f800, 0x02fa1f };
	["Tags"                                            ] = { 0x0e0000, 0x0e007f };
	["Variation Selectors Supplement"                  ] = { 0x0e0100, 0x0e01ef };
	["Supplementary Private Use Area-A"                ] = { 0x0f0000, 0x0fffff };
	["Supplementary Private Use Area-B"                ] = { 0x100000, 0x10ffff };
}

-- Iterates over all Unicode blocks in ascending order of their first code
-- point. Intended for a generic `for`:
--
--   for i, name, first, last in export.enum_blocks() do ... end
--
-- Every call builds and sorts a fresh list from the `blocks` table.
--
-- @return iterator function, the sorted list (iterator state), 0 (initial
--         control value)
function export.enum_blocks()
	local sorted = {}
	for block_name, bounds in pairs(blocks) do
		sorted[#sorted + 1] = { block_name, bounds[1], bounds[2] }
	end

	table.sort(sorted, function (left, right)
		return left[2] < right[2]
	end)

	-- Stateless iterator: advances the index and unpacks that entry.
	local function step(entries, index)
		local next_index = index + 1
		local entry = entries[next_index]
		if entry then
			return next_index, entry[1], entry[2], entry[3]
		end
		return nil
	end

	return step, sorted, 0
end

-- Returns the name of the plane containing `codepoint`. Planes listed in the
-- `planes` table use their proper name; any other plane is reported as
-- "Plane N", where N is the plane number.
--
-- @param codepoint number
-- @return string
function export.lookup_plane(codepoint)
	local plane_number = math.floor(codepoint / 0x10000)
	local plane_name = planes[plane_number]
	if plane_name then
		return plane_name
	end
	return ("Plane %u"):format(plane_number)
end

-- Returns the name of the block whose inclusive range contains `codepoint`.
-- Returns nothing when no entry of the `blocks` table covers it.
--
-- @param codepoint number
-- @return string, or no value
function export.lookup_block(codepoint)
	for block_name, bounds in pairs(blocks) do
		local first, last = bounds[1], bounds[2]
		if (first <= codepoint) and (codepoint <= last) then
			return block_name
		end
	end
end

-- Returns the first and last code point of the block called `name`, exactly
-- as stored in the `blocks` table. Returns nothing for an unknown name.
--
-- @param name string  block name
-- @return number, number  (or no values)
function export.get_block_range(name)
	local bounds = blocks[name]
	if not bounds then
		return
	end
	return bounds[1], bounds[2]
end

-- Checks whether `pagename` can be used as a page name according to the
-- rules encoded here. The result is false when the string
--   * contains one of the individually rejected code points below,
--   * contains a code point in U+2000..U+200A,
--   * contains a code point that is_printable rejects, or
--   * has no code point other than space separators (this includes the
--     empty string).
--
-- @param pagename string
-- @return boolean
function export.is_valid_pagename(pagename)
	local rejected = {
		[0x0023] = true, -- #
		[0x005b] = true, -- [
		[0x005d] = true, -- ]
		[0x007b] = true, -- {
		[0x007c] = true, -- |
		[0x007d] = true, -- }
		[0x180e] = true, -- MONGOLIAN VOWEL SEPARATOR
		[0xfffd] = true, -- REPLACEMENT CHARACTER
	}
	local seen_non_space = false

	for cp in mw.ustring.gcodepoint(pagename) do
		if rejected[cp] or ((cp >= 0x2000) and (cp <= 0x200a)) then
			return false
		end

		local printable, category = export.is_printable(cp)
		if not printable then
			return false
		end

		if category ~= "space-separator" then
			seen_non_space = true
		end
	end

	return seen_non_space
end

-- Returns the elements of the sequence `what`, starting at index `from`
-- (default 1), as multiple values. The elements are first copied into a
-- fresh table by walking `what` with ipairs.
-- NOTE(review): presumably this exists because the inputs come from
-- mw.loadData, whose tables cannot be handed to unpack() directly — confirm.
--
-- @param what  sequence to read
-- @param from  optional first index to include
-- @return the selected elements
local function manual_unpack(what, from)
	local first = from or 1
	local picked = {}
	for index, value in ipairs(what) do
		if index >= first then
			picked[#picked + 1] = value
		end
	end
	return unpack(picked)
end

-- Builds a lookup function over lazily loaded per-code-point data.
--
-- @param loader      function returning two tables on first use:
--                    `singles` (code point -> value) and `ranges`, a
--                    sequence of { first, last, value... } entries sorted by
--                    ascending `first`
-- @param match_func  function (codepoint, value...) that turns the stored
--                    value(s) into the final result
-- @param ...         default value(s) handed to match_func for code points
--                    that are in neither `singles` nor any range
-- @return function (codepoint) -> whatever match_func returns
--
-- Every range that is hit, and every gap between ranges that is hit, is
-- remembered in a small cache that is consulted before the full list.
--
-- Changes from the previous version:
--   * `ranges` is walked with ipairs instead of pairs. The gap detection
--     depends on visiting the ranges in ascending order, which pairs does
--     not guarantee.
--   * Code points past the last range now receive the default value(s),
--     like code points in a gap do; previously match_func was called with
--     no value at all in that case.
local function memo_lookup(loader, match_func, ...)
	local defaults = { ... }
	local cache = {}
	local singles, ranges

	return function (codepoint)
		if not singles then
			singles, ranges = loader()
		end

		if singles[codepoint] then
			return match_func(codepoint, singles[codepoint])
		end

		-- Cache entries are plain tables, so the built-in unpack is fine.
		for _, range in ipairs(cache) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				return match_func(codepoint, unpack(range, 3))
			end
		end

		-- First code point not covered by the ranges seen so far.
		local gap_start = 0
		for _, range in ipairs(ranges) do
			if (range[1] <= codepoint) and (codepoint <= range[2]) then
				-- `range` comes from the loader, so it is copied with
				-- manual_unpack rather than the built-in unpack.
				table.insert(cache, { manual_unpack(range) })
				return match_func(codepoint, manual_unpack(range, 3))
			elseif codepoint < range[1] then
				-- The code point lies in the gap before this range.
				table.insert(cache, { gap_start, range[1] - 1, unpack(defaults) })
				return match_func(codepoint, unpack(defaults))
			else
				gap_start = range[2] + 1
			end
		end

		-- Beyond the last range: treated the same as a gap.
		return match_func(codepoint, unpack(defaults))
	end
end

-- export.is_combining(codepoint): truthy when the value stored for the code
-- point in 'Module:Unicode data/combining' is non-zero. Code points that
-- fall between the listed ranges get the default value 0, i.e. false.
-- NOTE(review): the stored value is presumably the canonical combining
-- class — confirm against the data module.
export.is_combining = memo_lookup(
	function ()
		local combining_data = mw.loadData('Module:Unicode data/combining')
		return combining_data.single, combining_data.ranges
	end,
	function (codepoint, class)
		if not class then
			-- no value available: hand back the nil/false unchanged
			return class
		end
		return class ~= 0
	end,
	0
)

-- lookup_control(codepoint): returns the category string stored for the code
-- point in 'Module:Unicode data/control', or "assigned" when nothing is
-- stored for it. The callers in this file compare the result against
-- "assigned", "unassigned" and "space-separator".
local lookup_control = memo_lookup(
	function ()
		local control_data = mw.loadData('Module:Unicode data/control')
		return control_data.single, control_data.ranges
	end,
	function (codepoint, category)
		if category then
			return category
		end
		return "assigned"
	end,
	"assigned"
)

-- Returns true unless lookup_control classifies the code point as
-- "unassigned".
--
-- @param codepoint number
-- @return boolean
function export.is_assigned(codepoint)
	local category = lookup_control(codepoint)
	return not (category == "unassigned")
end

-- Reports whether lookup_control classifies the code point as "assigned" or
-- "space-separator".
--
-- @param codepoint number
-- @return boolean  the verdict
-- @return string   the category reported by lookup_control
function export.is_printable(codepoint)
	local category = lookup_control(codepoint)
	local printable = (category == "space-separator") or (category == "assigned")
	return printable, category
end

-- Reports whether lookup_control classifies the code point as
-- "space-separator".
--
-- @param codepoint number
-- @return boolean  the verdict
-- @return string   the category reported by lookup_control
function export.is_whitespace(codepoint)
	local category = lookup_control(codepoint)
	local is_space = (category == "space-separator")
	return is_space, category
end

-- to be used in language-neutral context only (e.g. character lists)

-- Script code -> character-class pattern. Built from "Module:scripts/data"
-- the first time get_script has to consult it.
local script_pats

-- Script codes that get_script never reports.
local script_blacklist = {
	Latf = true,
	Hans = true,
	Hant = true,
	Kore = true,
	Jpan = true,
}

-- Pattern -> script code, for the patterns that have matched at least once.
-- These are tried before the full pattern list.
local script_cache = {}

-- Returns the code of a script whose character pattern matches the given
-- code point, or "Zyyy" when none matches.
--
-- Both pattern tables are traversed with pairs, so when a character matches
-- more than one pattern, which script is returned is not specified.
--
-- @param codepoint number
-- @return string  script code
function export.get_script(codepoint)
	local char = mw.ustring.char(codepoint)

	for pattern, code in pairs(script_cache) do
		if mw.ustring.match(char, pattern) then
			return code
		end
	end

	if not script_pats then
		local script_data = mw.loadData("Module:scripts/data")
		script_pats = {}
		for code, info in pairs(script_data) do
			local usable = info.characters and not script_blacklist[code]
			if usable then
				script_pats[code] = "[" .. info.characters .. "]"
			end
		end
	end

	for code, pattern in pairs(script_pats) do
		if mw.ustring.match(char, pattern) then
			script_cache[pattern] = code
			return code
		end
	end

	return "Zyyy"
end

-- Code points whose entry title is a fixed descriptive title rather than the
-- character itself.
local unsupported_title = {
	[0x0020] = "Unsupported titles/Space",
	[0x0023] = "Unsupported titles/Number sign",
	[0x002e] = "Unsupported titles/Full stop",
	[0x003a] = "Unsupported titles/Colon",
	[0x003c] = "Unsupported titles/Less than sign",
	[0x003e] = "Unsupported titles/Greater than sign",
	[0x005b] = "Unsupported titles/Left square bracket",
	[0x005d] = "Unsupported titles/Right square bracket",
	[0x005f] = "Unsupported titles/Low line",
	[0x007b] = "Unsupported titles/Left curly bracket",
	[0x007c] = "Unsupported titles/Vertical line",
	[0x007d] = "Unsupported titles/Right curly bracket",
	[0x1680] = "Unsupported titles/Ogham space",
	[0xfffd] = "Unsupported titles/Replacement character",
}

-- Returns the entry title for a code point:
--   * the fixed title from unsupported_title, when one is listed;
--   * nil when lookup_control reports anything other than "assigned";
--   * otherwise the character itself.
--
-- @param codepoint number
-- @return string or nil
function export.get_entry_title(codepoint)
	local fixed_title = unsupported_title[codepoint]
	if fixed_title then
		return fixed_title
	end

	if lookup_control(codepoint) == "assigned" then
		return mw.ustring.char(codepoint)
	end

	return nil
end

return export