-- Table of public functions; it is returned at the end of the module.
local export = {}
-- Body of a character class (without the surrounding brackets) listing the
-- characters treated as kanji. It is spliced into patterns elsewhere in this
-- module as '[' .. kanji_pattern .. ']'.
-- NOTE(review): the string ends in a bare '-', so once wrapped in brackets a
-- literal hyphen also matches; it looks like the upper bound of the last range
-- may have been lost -- confirm against the intended code point ranges.
local kanji_pattern = "々一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-"
-- Returns an array of every possible alignment between the kanji runs of
-- `kanji` and the reading `kana`, written as '[kanji](kana)' annotations.
-- For example, simple_match('物の哀れ', 'もののあわれ') returns
-- { '[物](も)の[哀](のあわ)れ', '[物](もの)の[哀](あわ)れ' }.
-- An empty array means the non-kanji text of `kanji` cannot be found at the
-- required positions in `kana`.
local function simple_match(kanji, kana)
	-- Wrap every run of kanji/alphanumeric characters in backticks so that the
	-- recursive matcher can tell "needs a reading" runs from literal text.
	local kanji_segments = mw.ustring.gsub(kanji, "([A-Za-z0-9々一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-0-9A-Za-z]+)", "`%1`")

	-- segments: remaining marked-up text; remaining_kana: reading still unassigned.
	local function simple_match_rec(segments, remaining_kana)
		if not segments:find('`', 1, true) then
			-- No kanji run left: the literal tail must equal the leftover kana.
			return (segments == remaining_kana) and { remaining_kana } or {}
		end

		local kana_portion, kanji_portion, rest = mw.ustring.match(segments, '(.-)`(.-)`(.*)')
		if not kana_portion then
			-- Unbalanced backticks (the input itself contained one): no match.
			return {}
		end

		-- The literal text before the kanji run must be a prefix of the kana.
		-- Compare it as plain text: building a pattern from it would
		-- misinterpret magic characters such as '.', '(' or '%'.
		-- A byte-wise prefix test is sufficient for UTF-8 strings.
		if remaining_kana:sub(1, #kana_portion) ~= kana_portion then
			return {}
		end
		local after_prefix = remaining_kana:sub(#kana_portion + 1)

		-- Try every non-empty prefix of the leftover kana as the reading of
		-- this kanji run, and combine it with each alignment of the rest.
		local candidates = {}
		for i = 1, mw.ustring.len(after_prefix) do
			local reading = mw.ustring.sub(after_prefix, 1, i)
			local tails = simple_match_rec(rest, mw.ustring.sub(after_prefix, i + 1))
			for _, tail in ipairs(tails) do
				candidates[#candidates + 1] = kana_portion .. '[' .. kanji_portion .. '](' .. reading .. ')' .. tail
			end
		end
		return candidates
	end

	return simple_match_rec(kanji_segments, kana)
end
-- Public entry point: returns the alignment of `kanji` and `kana` when the
-- simple matcher finds exactly one, otherwise the whole of `kanji` annotated
-- with the whole of `kana`.
function export.simple_match(kanji, kana)
	local matches = simple_match(kanji, kana)
	if #matches == 1 then
		return matches[1]
	end
	-- Zero or several alignments: fall back to a single annotation.
	return '[' .. kanji .. '](' .. kana .. ')'
end
-- Reads the wikitext of the entry and returns an array with one string per
-- {{ja-kanjitab|...}} call found in it, each rewritten as '[kanji](kana)'
-- annotations over the entry title.
-- For example, extract_kanjitab_from_entry('書留') returns { '[書](かき)[留](とめ)' }
-- if the 書留 entry contains {{ja-kanjitab|か|と|o1=き|o2=め|yomi=k}}.
-- Returns an empty array when the title is invalid, the page has no content,
-- or no kanjitab is found.
local function extract_kanjitab_from_entry(entry_title)
	-- mw.title.new returns nil for an invalid title, so guard before indexing.
	local title = mw.title.new(entry_title)
	local entry_wikicode = title and title:getContent() or ''
	local results = {}

	for kanjitab in mw.ustring.gmatch(entry_wikicode, '{{ja%-kanjitab|(.-)}}') do
		-- Replace the pipe inside piped wikilinks with a backtick so that
		-- splitting on '|' below does not cut a link in two.
		-- NOTE(review): the backtick is not converted back afterwards.
		local params = mw.ustring.gsub(kanjitab, '%[%[([^%[%]|]-)|([^%[%]|]-)%]%]', '[[%1`%2]]')

		-- Collect template arguments: positional ones under 1, 2, ...; named
		-- ones under their name, with 'k' and 'o' aliased to 'k1' and 'o1'
		-- and numeric names converted to numbers.
		local args, counter = {}, 1
		for arg in mw.text.gsplit(params, '|') do
			local k, v = mw.ustring.match(arg, '^(.-)=(.*)$')
			if k then
				k = ({ k = 'k1', o = 'o1' })[k] or tonumber(k) or k
				args[k] = v
			else
				args[counter] = arg
				counter = counter + 1
			end
		end

		-- Annotate each kanji of the title with its reading. A positional
		-- argument may end in digits giving how many kanji share the reading;
		-- the extra kanji are tagged with <CONCAT> and merged afterwards.
		local argpos, skip = 1, 0
		local result = mw.ustring.gsub(entry_title, '[' .. kanji_pattern .. ']', function(kanji)
			if skip > 0 then
				skip = skip - 1
				return '<CONCAT>' .. kanji
			end

			local reading_kana, reading_length = '', nil
			local positional = args[argpos]
			if positional then
				local kana_part, length_part = mw.ustring.match(positional, '^([^0-9]*)([0-9]*)$')
				if kana_part then
					reading_kana, reading_length = kana_part, length_part
				else
					-- Digits in an unexpected position: keep the raw argument
					-- as the reading rather than failing on a nil value.
					reading_kana = positional
				end
			end
			if args['k' .. argpos] then reading_kana = args['k' .. argpos] end
			if args['o' .. argpos] then reading_kana = reading_kana .. args['o' .. argpos] end

			skip = (tonumber(reading_length or '') or 1) - 1
			argpos = argpos + 1
			return '[' .. kanji .. '](' .. reading_kana .. ')'
		end)

		-- Fold every '<CONCAT>x' into the preceding annotation. Each pass
		-- removes at most one marker per annotation, so repeat until a pass
		-- changes nothing. Markers after an empty reading are left as they are.
		local merged
		repeat
			result, merged = mw.ustring.gsub(result, '%[([^%[%]]+)%]%(([^%(%)]+)%)<CONCAT>(.)', '[%1%3](%2)')
		until merged == 0

		results[#results + 1] = result
	end

	return results
end
-- Aligns `kanji` with `kana`, preferring the simple matcher.
-- The simple result is used only when it is unique and contains no two
-- adjacent kanji (such as [書留](かきとめ)). Otherwise the entry named `kanji`
-- is read and the first kanjitab whose readings concatenate to `kana` is
-- returned. If neither works, the whole of `kanji` is annotated with `kana`.
function export.accurate_match(kanji, kana)
	local candidates = simple_match(kanji, kana)
	local adjacent_kanji = '[' .. kanji_pattern .. '][' .. kanji_pattern .. ']'
	if #candidates == 1 and not mw.ustring.find(candidates[1], adjacent_kanji) then
		return candidates[1]
	end

	for _, annotated in ipairs(extract_kanjitab_from_entry(kanji)) do
		-- Strip the annotations down to their readings and compare with kana.
		local reading = mw.ustring.gsub(annotated, '%[([^%[%]]+)%]%(([^%(%)]+)%)', '%2')
		if reading == kana then
			return annotated
		end
	end

	-- if all fails
	return '[' .. kanji .. '](' .. kana .. ')'
end
-- Hand the table of public functions (simple_match, accurate_match) to the caller.
return export