--[=[
TODO: Handle cases like this:
===Pronoun===
{{ja-pos|pronoun|かのじょ}}
# {{ja-def|彼女}} [[she]]; [[her]]
===Noun===
{{ja-noun|かのじょ|カノジョ}}
# {{ja-def|彼女|カノジョ}} [[girlfriend]]
The {{ja-see}} template on [[カノジョ]] should take care not to categorize the term as a pronoun, in addition to ignoring the first {{ja-def}} definition line.
]=]
local export = {}
local find = mw.ustring.find
local gmatch = mw.ustring.gmatch
local m_ja = require('Module:ja')
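-- Character ranges matching kanji: the CJK Unified Ideographs block, Extension A, several Japanese-specific ideographs from the CJK Compatibility Ideographs block, and the supplementary-plane extensions.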
local kanji_pattern = "一-鿿㐀-䶿﨎﨏﨑﨓﨔﨟﨡﨣﨤﨧-﨩𠀀-𪛟𪜀-𰀀-"
local japanese_pattern = 'ぁ-ゖァ-ヺー' .. kanji_pattern .. 'a-zA-Z0-9〆々'
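-- Japanese headword-line templates; a section containing one of these is treated as a lemma section.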
local headword_templates = {
['ja-adj'] = true, ['ja-pos'] = true, ['ja-noun'] = true, ['ja-phrase'] = true,
['ja-verb'] = true, ['ja-verb form'] = true, ['ja-verb-suru'] = true,
}
local function find_headword_template(wikitext)
local index =
wikitext:find('{{ja%-adj[|}]') or
wikitext:find('{{ja%-pos[|}]') or
wikitext:find('{{ja%-noun[|}]') or
wikitext:find('{{ja%-phrase[|}]') or
wikitext:find('{{ja%-verb[|}]') or
wikitext:find('{{ja%-verb form[|}]') or
wikitext:find('{{ja%-verb%-suru[|}]')
if index then
-- This assumes that the template has matching braces.
return wikitext:match('%b{}', index)
end
end
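-- A hypothetical example: for wikitext containing {{ja-noun|かのじょ|カノジョ}} (and no earlier headword template), this returns the string '{{ja-noun|かのじょ|カノジョ}}'.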
local function contains(list, item)
for i = 1, #list do
if list[i] == item then return true end
end
return false
end
-- Returns the text between the first match of str1 and the following match of str2, or up to the end of the string if str2 is not found; returns nil if str1 is not found.
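-- A worked example (hypothetical input): match_between('==Japanese==\nfoo\n----\n==Korean==', '==Japanese==\n', '%-%-%-%-') returns 'foo\n'.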
local function match_between(source, str1, str2)
local i = select(2, source:find(str1))
if not i then return end
local j = source:find(str2, i)
return source:sub(i + 1, j and j - 1)
end
-- Splits wikitext into its L3 sections, returned as { { header, wikitext }, ... }; any text above the first L3 header is included with header "". Also returns a flag indicating whether the entry has multiple etymology sections.
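-- For example (hypothetical wikitext), get_l3_sections('intro\n===Kanji===\nA\n===Noun===\nB') returns
-- { { '', 'intro\n' }, { 'Kanji', '===Kanji===\nA\n' }, { 'Noun', '===Noun===\nB' } } and false (no multiple etymologies).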
local function get_l3_sections(wikitext)
local l3_sections = {}
local multi_etym = false
-- special hack mentioned below
if not wikitext:find('===Etymology 1===') then
wikitext = wikitext:gsub('{{ja%-spellings', '=== ===\n{{ja-spellings')
wikitext = wikitext:gsub('{{ja%-kanjitab', '=== ===\n{{ja-kanjitab')
end
local header, header_start, header_end, prev_header, prev_header_start, prev_header_end
while true do
header_start, header_end, header = wikitext:find('%f[=]===([^=]+)===', prev_header_end)
if header == 'Etymology 1' then multi_etym = true end
local section_content = wikitext:sub(prev_header_start or 0, (header_start or 0) - 1)
table.insert(l3_sections, { prev_header or '', section_content })
if not header_start then
break
end
prev_header, prev_header_start, prev_header_end = header, header_start, header_end
end
return l3_sections, multi_etym
end
-- A function to parse Japanese entries, returning a list of etym sections, each having the form { wikitext, type = ( 'lemma' | 'redirect' | '' ), keys = <a list of alternative spellings> }. In case of multiple etymologies, each ===Etymology n=== part constitutes an etym section. Otherwise, the whole Japanese section minus any ===Kanji [n]=== subsections constitutes a single etym section.
-- Note: The function divides sections strictly by L3 headers. As a result:
-- (1) If an entry describes both a kanji and a single word, any templates that open the word before its first L3 header (such as {{ja-spellings}}) would erroneously be considered part of the preceding kanji section. This function only remedies {{ja-spellings}} and {{ja-kanjitab}}, by inserting an empty header === === above them before parsing. (This problem does not arise for entries with multiple etymologies, since each word must begin with ===Etymology n===.)
-- (2) If an entry describes multiple words, word-specific templates such as {{topics|ja|Biology}} must be placed at the end of the relevant word rather than at the end of the whole entry. If they are put at the end of the ==Japanese== section, they will either be erroneously attributed to the final word, or end up in trailing sections such as ===References=== and thus be ignored when {{ja-see}} copies categories around.
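-- A hypothetical illustration, using the entry from the TODO at the top: if the ==Japanese== section of [[カノジョ]] consisted only of {{ja-kanjitab}} and {{ja-see|彼女}}, this would return a single etym section with type = 'redirect' and 彼女 among its keys.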
local function extract_etym_sections(lemma)
local page = mw.title.new(lemma):getContent() or ''
local l2 = match_between(page, '==Japanese==\n', '%-%-%-%-')
-- split into L3 sections
local l3_sections, multi_etym
if l2 then
l3_sections, multi_etym = get_l3_sections(l2)
end
-- group the L3 sections into etym sections
local etym_sections = {}
if multi_etym then
for _, v in ipairs(l3_sections) do
local header, content = unpack(v)
if find(header, '^Etymology %d+$') then
table.insert(etym_sections, content)
end
end
else
local word = {}
for _, v in ipairs(l3_sections) do
local header, content = unpack(v)
if not (header and header:find("^Kanji ?%d*$")) then
table.insert(word, content)
end
end
word = table.concat(word, '\n')
table.insert(etym_sections, word)
end
-- finally, determine the type of each etym section
for i = 1, #etym_sections do
local etym_section = etym_sections[i]
local ja_see = find(etym_section, '{{ja%-see[|}]') or find(etym_section, '{{ja%-see-kango[|}]')
if ja_see then
local keys = {}
for v in gmatch(etym_section:match('%b{}', ja_see), '[' .. japanese_pattern .. ']+') do
table.insert(keys, v)
end
etym_sections[i] = { etym_section, type = 'redirect', keys = keys }
else
local ja_forms = find(etym_section, '{{ja%-spellings[|}]')
if ja_forms then
local keys = {}
for v in gmatch(etym_section:match('%b{}', ja_forms), '[' .. japanese_pattern .. ']+') do
table.insert(keys, v)
end
etym_sections[i] = { etym_section, type = 'lemma', keys = keys }
else
local headword_template = find_headword_template(etym_section)
if headword_template then
local keys = {}
for v in gmatch(headword_template:gsub('[ ^%-%.]', ''), '[' .. japanese_pattern .. ']+') do
table.insert(keys, v)
end
etym_sections[i] = { etym_section, type = 'lemma', keys = keys }
else
etym_sections[i] = { etym_section, type = '', keys = {} }
end
end
end
end
return etym_sections
end
export.extract_etym_sections = extract_etym_sections
-- Parses the Japanese entry at lemma using the function above, filters the result down to the lemma-type etym sections whose alternative spellings include key, and returns them concatenated as wikitext.
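-- A hypothetical call, continuing the TODO example at the top: get_etym_section('彼女', 'カノジョ', frame) returns the wikitext of the lemma-type etym sections on [[彼女]] whose spelling keys include カノジョ, concatenated together.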
function export.get_etym_section(lemma, key, frame)
local words = extract_etym_sections(lemma, frame)
local result = {}
for _, v in ipairs(words) do
if v.type == 'lemma' and contains(v.keys, key) then
table.insert(result, v[1])
end
end
local wikitext = table.concat(result, '\n')
return wikitext
end
local templates_to_exclude = {
-- These templates are ignored as an optimization since they don't generate categories.
['m'] = true, ['l'] = true, ['ja-l'] = true, ['ja-r'] = true, ['gloss'] = true,
['w'] = true, ['wp'] = true, ['swp'] = true, ['wikipedia'] = true,
['lang'] = true, ['furigana'] = true, ['wj'] = true, ['lj'] = true, ['ruby/ja-w2'] = true, ['ruby/ja'] = true, ['ruby'] = true,
['ja-kanji forms'] = true, ['w2'] = true, ['sense'] = true,
['IPAfont'] = true, ['IPAchar'] = true,
['ja-adj-infl'] = true, ['ja-i'] = true, ['ja-na'] = true, ['ja-adjdecl'] = true, ['ja-decl-na'] = true,
['ja-go-bu'] = true, ['ja-go-gu'] = true, ['ja-go-ku'] = true, ['ja-go-mu'] = true, ['ja-go-nu'] = true,
['ja-go-ou'] = true, ['ja-go-ru'] = true, ['ja-go-su'] = true, ['ja-go-tsu'] = true, ['ja-go-u'] = true,
['ja-honorific'] = true, ['ja-ichi'] = true, ['ja-kuru'] = true, ['ja-suru'] = true, ['ja-suru-i-ku'] = true, ['ja-suru-tsu'] = true,
['ja-verbconj'] = true, ['ja-verbconj-auto'] = true, ['ja-verbconj-row'] = true, ['ja-verbconjugation'] = true, ['ja-zuru'] = true,
['ja-kanji spellings'] = true, ['ja-ks'] = true, ['ja-spellings'] = true, ['ja-forms'] = true,
['Japanese first-person pronouns'] = true, ['der-top'] = true, ['der-bottom'] = true,
['der-mid'] = true, ['der-top3'] = true, ['der-top4'] = true, ['der-top5'] = true, ['rel-top'] = true,
-- These templates are ignored since they generate categories that are spelling-specific or that we're not interested in
['ja-kanjitab'] = true, ['ateji'] = true, ['ja-ateji'] = true, ['ja-kanji'] = true, ['ja-readings'] = true,
['ja-def'] = true, ['synonyms'] = true,
}
function export.extract_definitions_and_categories(wikitext, source, key, frame)
local def = {}
local cat = {}
local current_section = ''
local key_contains_kanji = find(key, '[' .. kanji_pattern .. ']') ~= nil
local prev_pos = 0
-- Search for headers and definition lines (numbered lists) and insert
-- any intervening text into cat.
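-- The pattern matches each line that begins with a run of '#' or '=' not followed by ':' or '*', capturing the
-- line's start offset, the whole line, the leading '#'/'=' run, and the offset just past the line. Example and
-- quotation lines ("#:", "#*") are therefore not matched and are treated as intervening text below.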
for start_pos, line, first_char, end_pos in wikitext:gmatch('%f[^\n]()(([#=]+)%f[^#=:*][^\n]+)()') do
if first_char == "#" then
if not line:find('{{rfdef') and not (key_contains_kanji and line:find('{{ja%-def|') and not line:find('|' .. key .. '[|}]')) then
table.insert(def, { line, pos = current_section })
end
elseif first_char:sub(1, 1) == "=" then
current_section = line:match("^=*(.-)=*$")
end
if start_pos - prev_pos > 1 then
table.insert(cat, wikitext:sub(prev_pos + 1, start_pos - 1))
end
prev_pos = end_pos
end
table.insert(cat, wikitext:sub(prev_pos + 1)) -- add last bit
-- expand the other parts for categories
local cat = table.concat(cat, '\n')
cat = cat:gsub('<ref[ >].-</ref>', '')
cat = cat:gsub('<references/>', '')
local function process_template_header(a, b) -- if the template begins with "{{ja-usex|", a is "ja-usex" and b is "|".
if templates_to_exclude[a] then
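-- Swap the template name for "=" so the braces stay balanced but the excluded template no longer expands to anything that adds categories.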
return '{{=' .. b
elseif headword_templates[a] then
local source_script = m_ja.script(source)
if source_script == 'Hira' or source_script == 'Kana' or source_script == 'Hira+Kana' then
return '{{' .. a .. '|' .. source .. b
else
return '{{' .. a .. b
end
elseif a == 'ja-usex' or a:find('^quote') then -- special hack
return '[[Category:Japanese terms with usage examples]]{{=' .. b
else
return '{{' .. a .. b
end
end
cat = cat:gsub('{{(.-)%s*([|}])', process_template_header)
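-- Instead of expanding {{ja-pron}}, add the pronunciation-related categories directly, based on its parameters.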
cat = cat:gsub('{{ja%-pron.-}}', function(pron)
local result = {}
if not find(pron, '|noipa=') then
table.insert(result, '[[Category:Japanese terms with IPA pronunciation]]')
end
if find(pron, '|a=') or find(pron, '|audio=') then
table.insert(result, '[[Category:Japanese terms with audio links]]')
end
return table.concat(result)
end)
cat = frame:preprocess(cat)
local cat2 = {}
for i in cat:gmatch('%[%[Category:.-%]%]') do table.insert(cat2, i) end
cat = table.concat(cat2)
-- one might want to modify the sortkeys here
return def, cat
end
return export