#!/usr/bin/ruby -Ke
#
#cidbushu.txt:
#1505,乾 l 常用 b l十日十 r乞 a u十日十人 r乙
#↑
#第1フィールド: 合成後どうなるか。コンマ区切りでいくらでも。
# すべて同一の字形を指し示すための表現である
# 4桁以上の生数字: Adobe-Japan1(-6)におけるCID
# U+〜〜: Unicode@MSゴシック字形小塚明朝字形
# その他文字コード表現: (略)
# 生漢字1字{@90JIS}: その生漢字のバイト列が指すコードポイントの
# JIS X 0208:1990での例示字形
# 生漢字1字@78JIS: 前略JIS X 0208:1978後略
# 生漢字1字@04JIS: 前略JIS X 0213:2004後略
# その他(3桁以下の半角数字なども含む): JIS X 0208:1990に入って
# いないグリフに適宜つけた通称
#
#それより後ろ: まずスペース区切り
#l: 次のフィールドと合わせて法律的情報
#a: 分けかたは合ってる&&字形をある程度表現できてる(括弧→字形を表現できてない主要因)
#b: 分けかたは間違ってる&&字形をある程度表現できてる
#c: 正しい分けかた&&意味記述(字形が表現できてない)または異体字情報
#!c: 分割不能な象形文字等 フィールドを消費しない
#その他半角英字や'?'など1文字: 作業中である。とりあえずbと同等に扱うこと
#
#cがなくてaがあるのは
#1. aが分けかた字形意味全てにおいて完璧
#2. aで括弧ついてないのは意味も合ってる(それで括弧つきのほうは一言じゃ説明できない)
# (括弧つきのが正しい意味ならcに置けばいい)
#aと!c両方あるのは分割不能かどうか微妙
#c =?はその先が本字 たどっていくと意味が合ってる解字が出てくるはず
#c /?は自分が借字となってその字の意味を取り込んでしまったパターン
#c ?はその字の俗字だったものが別の意味を獲得(のっとられ側の字がない借字?)
#c |?はどっちがえらいとも言い難い同音同義別解字("異体""別体"もここに含むべき)
#("通じる"は/?か|?か微妙)
#
#a-cの後ろに任意長さの分解定義が続く
#a (位置を表す英字1文字)(グリフを指し示す表現) ()() ... b ()() ...
#作業中なので位置指定子が記述されていない場合がある
#l&r: 偏と旁 u&d: 冠と脚 o&i: 構や垂や繞、あとむりやり間に挟める
#p&s: 品口3の口がp(arts)で3がs(tructure) +&-: 引き算
#lまたはrとuまたはdの組み合わせは直線で切り分けられるのにしてはいけないパターン
#http://www.itscj.ipsj.or.jp/ipsj-ts/02-02/ips_charid/toc.htm
#↑のP4〜P7に近い感じ
#
#
#
#
#http://www2.odn.ne.jp/alt-quinon/files/ptex/x0213/jx2004tbl.pdf
#http://pc5.2ch.net/test/read.cgi/unix/1082032043/183
#http://www.taishukan.co.jp/kanji/archive/jinmei_minaoshi.html
# Writes the command-line synopsis (mode switches followed by the input
# file name) to standard error.
def usage
  modes = '--format|--sort|--to-rev{-tc}|--to-thunder|--to-chise|--for-tex'
  $stderr.print "ruby #{$0} (#{modes}) cidbushu.txt\n"
end
opt = ARGV.shift
if opt == '--format'
while gets
chop!
$_ << ' '
if $_[0..0] == "#"
puts $_
next
end
r = $_.split(/\s+/)
#p r
if r.length > 2 && r[1] == 'l'
print r[0], ' l ', r[2], ' ' * [21-r[0].length-r[2].length, 1].max
r.shift
r.shift
else
print r[0], ' ' * (24 - r[0].length)
end
r.shift
while r.length > 1 && r[0] !~ /^\#/
if r[0] =~ /^([a-z\?]|\Sc)$/
print r[0], ' ' * [2 - r[0].length, 1].max
r.shift
else
#if r[1] !~ /^([a-z\?]|\Sc)$/ && r[1] !~ /^[a-z\+\-\=\/\<\>\|]/
# r[1] = 'r' + r[1] if r[0] =~ /^l/
# r[1] = 'l' + r[1] if r[0] =~ /^r/
# r[1] = 'd' + r[1] if r[0] =~ /^u/
# r[1] = 'u' + r[1] if r[0] =~ /^d/
# r[1] = 'i' + r[1] if r[0] =~ /^o/
# r[1] = 'o' + r[1] if r[0] =~ /^i/
# r[1] = '+' + r[1] if r[0] =~ /^-/
# r[1] = '-' + r[1] if r[0] =~ /^+/
#end
r[0] = ' ' + r[0] if r[0] !~ /^[a-z\+\-\=\/\<\>\|]/
print r[0], ' ' * [12 - r[0].length, 1].max
r.shift
end
end
print r.join(' ') if r.length > 0
print "\n"
end
# --sort mode: print the input entries ordered by a key derived from the
# alias-resolved first name of each entry (comment and blank lines are echoed
# as soon as they are read).
elsif opt == '--sort'
# Alias table used by the sort mode.
ali = {}
# Lookup with an identity fallback: a key that has no entry is returned
# unchanged instead of yielding nil.
def ali.[](key)
  if self.has_key?(key)
    super
  else
    key
  end
end
# Load cidbushu.alias: on each usable line the first word is the target and
# every following word becomes an alias of it.
fp = open('cidbushu.alias')
while fp.gets
chop!
next if $_ =~ /^\s*\#/
# strip trailing comments and trailing whitespace
gsub!(/\#.*$/, '')
gsub!(/\s*$/, '')
# directive lines are not alias definitions; the sort mode ignores them
next if $_ =~ /^(delete|overwrite|protect)/
r = $_.split(/\s+/)
next if r.length <= 1
# drop a leading order-strong / order-weak / weak keyword
r.shift if r[0] =~ /^(order-(strong|weak)|weak)/
d = r.shift
r.each {|i|
# only names whose String#length exceeds 2 are registered
# (presumably "longer than one double-byte character" under Ruby 1.8 byte lengths -- TODO confirm)
ali[i] = d if i.length > 2
}
end
fp.close
# d collects [name, original line] pairs for every entry line.
d = Array.new
while gets
chop!
r = $_.split(/\s+/)
# blank lines and comment lines are printed immediately, ahead of the sorted entries
if r.length < 1 || r[0] =~ /^\#/
puts $_
next
end
# first field: comma-separated names; bare numbers of 4+ digits are dropped
r = r[0]
r = r.split(/\,/)
r.delete_if {|i| i =~ /^\d{4,}$/ }
# NOTE(review): if the first field held only numbers, r becomes nil here and the
# gsub in the key computation below would raise -- confirm the data never does this
r = r[0]
#p r
d.push([r, $_])
# a "c =X" annotation registers X (when longer than 2) as an alias of this entry
if $_ =~ /c \=(\S+)/ && $1.length > 2
ali[$1] = r
#p r, $1
end
end
# Prepend the sort key: the name resolved through up to three alias levels,
# with one of the listed prefixes removed when a further character follows it.
d.each {|i| i.unshift(ali[ali[ali[i[0]]]].gsub(/^(常用|旧|たて|よこ|たれ|にょう)(.)/) { $2 }) }
# Each element is now [key, name, line]. Compare key[0..1], then name[0..2];
# the third component has i and j swapped, so the rest of the name compares in reverse.
d.sort! {|i, j|
[i[0][0..1], i[1][0..2], j[1][2..-1]] <=> [j[0][0..1], j[1][0..2], i[1][2..-1]]
}
# print the original lines in sorted order
d.each {|i|
#p i[0..1]
puts i[2]
}
# --to-expand is recognised but has an empty body.
elsif opt == '--to-expand'
# --to-rev* and --to-thunder share everything up to the final output step.
elsif opt =~ /^--to-rev/ || opt == '--to-thunder'
# When false, the class selection further down swaps the ranks of classes 'a' and 'b'.
PREFER_REGULAR = false
if opt == '--to-thunder'
# NOTE(review): the next line appears truncated -- everything between '<' and '>' is
# missing. The `end` closing the `if` above and the initialisation of `rev` (used
# below, apparently as a hash of [composed, class] => [[pos, part], ...]) are not
# visible here. Restore from the original file before running.
prefer = < [[pos, part], ...]
# composed names flagged through 'order-weak' / 'order-strong' alias lines
order_weak = []
# composed name => snapshot taken before a 'weak' alias replacement
weak = {}
# Alias table, plus the lists of entry prefixes / suffixes that are to be
# stripped from part names later on.
ali = {}
delp = []
dels = []
# Lookup with an identity fallback.
#   ali[name]      -> stored alias target, or name itself when none is stored
#   ali[name, pos] -> like the above, but a "name(inJIS1)" / "name(inJIS2)"
#                     entry takes precedence when pos is a single character
#                     inside the corresponding character range
# Any other number of arguments yields nil.
def ali.[](*key)
  case key.length
  when 1
    key = key[0]
    return key unless self.has_key?(key)
    super
  when 2
    name, pos = key
    if pos =~ /^[亜-腕]$/ && self.has_key?(name+'(inJIS1)')
      return super(name+'(inJIS1)')
    elsif pos =~ /^[弌-熙]$/ && self.has_key?(name+'(inJIS2)')
      return super(name+'(inJIS2)')
    end
    return name unless self.has_key?(name)
    super(name)
  end
end
# Read the decomposition table through Kernel#gets and record every definition
# group as rev[[composed, class]] = [[pos, part], ...].
# `rev` must already be a Hash here; its initialisation is not visible in this part of the file.
while gets
chop!
next if $_ =~ /^\s*\#/
# strip trailing comments and trailing whitespace
gsub!(/\#.*$/, '')
gsub!(/\s*$/, '')
r = $_.split(/\s+/)
# NOTE(review): a blank line leaves r empty, so r[0] is nil and the split below raises
l = r[0]
# first field: comma-separated names of the composed glyph
l = l.split(/\,/)
# bare numbers of four or more digits are dropped from the name list
l.delete_if {|i| i =~ /^\d{4,}$/ }
# every remaining name (the first included) resolves to the first name without a trailing @90JIS
l.each {|i|
ali[i] = l[0].gsub(/\@90JIS$/, '')
}
composed = l[0].gsub(/\@90JIS$/, '')
r.shift
i = 0
parts = []
while i < r.length
if r[i] =~ /^([a-z\?]|\Sc)$/
if r[i] == 'l'
# 'l' and the field following it are skipped
i += 2
next
elsif r[i] == '!c' || r[i] =~ /^\Sc$/
# two-character markers ending in 'c' (such as '!c') are skipped
i += 1
next
end
# any other one-character marker is the class of the parts that follow
c = r[i]
else
# tokens starting with one of = / < > | are skipped
if r[i] =~ /^[\=\/\<\>\|]/
i += 1
next
end
l = r[i].gsub(/\((.*)\)$/) { $1 } #remove parenthesis meaning incorrect figure
# a part without a leading position character gets a blank position
l = ' ' + l if l !~ /^[a-z\+\-]/
# split into [position, part name]
l = [l[0..0], l[1..-1]]
parts << l
# the group ends at the last token or right before the next marker
if (i == r.length-1 || r[i+1] =~ /^([a-z\?]|\Sc)$/) && parts.length > 0
if rev.has_key?([composed, c])
$stderr.print "repeated composition definition:\n"
$stderr.print "#{[composed, c].inspect} => #{rev[[composed, c]].inspect}\n#{$_}\n"
# NOTE(review): parts is not cleared on this path, so the rejected parts stay in
# the list and are carried into the next group of the same line -- confirm intent
i += 1
next
end
rev[[composed, c]] = parts
parts = []
end
end
i += 1
end
end
# Apply cidbushu.alias to the definitions collected in rev. Each line is either a
# directive (delete / overwrite / protect / entry-prefix-will-be-deleted /
# entry-suffix-will-be-deleted) or an alias line "[keyword] target alias...".
fp = open('cidbushu.alias')
# names exempted from the key renaming of the next alias line
prot = []
while fp.gets
chop!
# every line of the alias file is echoed to standard error
$stderr.print "#{$_}\n"
next if $_[0..0] == '#'
gsub!(/\#.*$/, '')
gsub!(/\s*$/, '')
r = $_.split(/\s+/)
case r[0]
when 'delete'
# remove the definitions of every listed name, for each class letter tried
for i in 1...r.length
for j in 'abcdefijklmn?'.split(//)
next unless rev.has_key?([r[i], j])
parts = rev.delete([r[i], j])
end
end
when 'overwrite'
# drop any existing definition of r[1] ...
for j in 'abcdefijklmn?'.split(//)
next unless rev.has_key?([r[1], j])
parts = rev.delete([r[1], j])
end
# ... and replace it with the parts given on this line, stored under class 'b'
parts = r[2..-1]
parts.each_index {|i|
l = parts[i]
l = ' ' + l if l !~ /^[a-z\+\-]/
# resolve the part name through up to four alias levels, passing r[1] as the second lookup argument
l = [l[0..0], ali[ali[ali[ali[l[1..-1], r[1]], r[1]], r[1]], r[1]]]
parts[i] = l
}
rev[[r[1], 'b']] = parts
when 'protect'
prot = r[1..-1]
# when 'order-swap'
# for i in 'abcdefijklmn?'.split(//)
# next unless rev.has_key?([r[1], i])
# parts = rev[[r[1], i]]
# parts[0..1] = [parts[1], parts[0]]
# rev[[r[1], i]] = parts
# end
# rev.each {|cc, parts|
# next unless parts[0..1].collect {|i| i[1] }.include?(r[1])
# parts[0..1] = [parts[1], parts[0]]
# rev[cc] = parts
# }
###
# when 'try-making-malanalysis'
###
when 'entry-prefix-will-be-deleted'
delp += r[1..-1]
when 'entry-suffix-will-be-deleted'
dels += r[1..-1]
else
# Alias line. An optional leading keyword selects how replacements are recorded.
frec = nil
if r[0] == 'order-strong'
r.shift
frec = :ostrong
elsif r[0] == 'order-weak'
r.shift
frec = :oweak
elsif r[0] == 'weak'
r.shift
frec = :weak
end
# r[0] is now the target name, r[1..-1] are its aliases
for j in 1...r.length # safety net for entries that get an overwrite later
ali[r[j]] = r[0]
end
# Pass 1: rewrite part names inside every stored definition.
rev.each {|cc, parts|
#next unless parts.collect {|i| i[1] }.include?(r[1])
# fcr: after replacement the definition mentions its own composed name
fcr = false
for j in 1...r.length
parts.each {|k|
rj = r[j].dup
# NOTE(review): fquit is assigned but never read
fquit = false
# an alias carrying an (inJIS1)/(inJIS2) suffix only applies when the composed
# name is a single character inside the matching range
if r[j] =~ /\(inJIS1\)$/
rj.gsub!(/\(inJIS1\)$/, '')
#$stderr.puts cc.inspect
next unless cc[0] =~ /^[亜-腕]$/
elsif r[j] =~ /\(inJIS2\)$/
rj.gsub!(/\(inJIS2\)$/, '')
#$stderr.puts cc.inspect
next unless cc[0] =~ /^[弌-熙]$/
end
if k[1] == rj
# for 'weak' lines, keep a copy of the key and parts as they were before the replacement
weak[cc[0]] = [cc.collect {|l| l.dup}, parts.collect {|l| l.dup}] if frec == :weak # being aliased again later is not handled
k[1] = r[0]
order_weak |= [cc[0]] if frec == :oweak
fcr = true if r[0] == cc[0]
elsif k[1] == r[0]
order_weak |= [cc[0]] if frec == :ostrong
end
}
end
if fcr
#$stderr.print "deleted a crunched definition: #{cc.inspect} => #{parts.inspect}\n"
# NOTE(review): deletes from rev while rev.each is running
rev.delete(cc)
end
}
# Pass 2: move definitions keyed by an alias over to the target name.
for i in 'abcdefijklmn?'.split(//)
for j in 1...r.length
next if prot.include?(r[j])
next unless rev.has_key?([r[j], i])
parts = rev.delete([r[j], i])
fcr = false
parts.each {|k|
fcr = true if k[1] == r[0]
}
if fcr
# the alias's own definition contains the target: it is dropped
$stderr.print "deleted a crunched definition: #{[r[0], i].inspect} => #{parts.inspect}\n"
else
if rev.has_key?([r[0], i])
$stderr.print "overwriting a collided definition by: #{[r[0], i].inspect} => #{parts.inspect}\n"
end
rev[[r[0], i]] = parts
end
end
end
# a 'protect' list is consumed by the alias line that follows it
prot = []
end
end
fp.close
# Turn the collected prefix / suffix lists into patterns: a listed prefix followed
# by exactly one character, and exactly one character followed by a listed suffix.
delp = Regexp.new('^(' + delp.collect {|i| Regexp.escape(i) }.join('|') + ')(.)$')
dels = Regexp.new('^(.)(' + dels.collect {|i| Regexp.escape(i) }.join('|') + ')$')
# NOTE(review): reva is not referenced again in the visible code
reva = rev.dup
# Normalise the part names and discard definitions that cannot be used.
# NOTE(review): entries are deleted from rev while rev.each is running
rev.each {|cc, parts|
# strip the prefix / suffix, keeping only the single remaining character
parts = parts.collect {|i| i[1] = i[1].gsub(delp) { $2 }; i }
parts = parts.collect {|i| i[1] = i[1].gsub(dels) { $1 }; i }
# a definition that contains its own composed name is dropped
if parts.collect {|i| i[1] }.include?(cc[0])
$stderr.print "deleted a crunched definition: #{cc.inspect} => #{parts.inspect}\n"
rev.delete(cc)
next
end
rev[cc] = parts
# drop definitions whose composed name or any part name is longer than 2
# (String#length units); they are logged unless the composed name ends in @NNJIS
if cc[0].length > 2 || parts.collect {|i| i[1].length }.max > 2
$stderr.puts [cc, parts].inspect unless cc[0] =~ /@\d\dJIS$/
rev.delete(cc)
next
end
# drop definitions with a '?' in any part name
parts.each {|i|
if i[1] =~ /\?/
rev.delete(cc)
break
end
}
}
# Keep a single class per composed name. rev2 maps composed name => chosen class.
rev2 = {}
rev.each {|cc, parts|
if rev2.has_key?(cc[0])
# Rank strings: 'a'..'c' stand for themselves, any other class X ranks as 'bX'.
# (the local p shadows Kernel#p inside this block)
p = (rev2[cc[0]]=~/^[a-c]$/ ? rev2[cc[0]] : 'b'+rev2[cc[0]])
#p cc[1]
n = (cc[1]=~/^[a-c]$/ ? cc[1] : 'b'+cc[1])
if PREFER_REGULAR
else
# PREFER_REGULAR is false: swap a leading 'a' and 'b' in both rank strings
p = p.gsub(/^([ab])/) { $1 == 'a' ? 'b' : 'a' }
n = n.gsub(/^([ab])/) { $1 == 'a' ? 'b' : 'a' }
end
# the class with the smaller rank string wins
if p > n
rev2[cc[0]] = cc[1]
end
else
rev2[cc[0]] = cc[1]
end
}
# remove every definition whose class was not the one chosen for its composed name
rev.each {|cc, parts|
next if rev2[cc[0]] == cc[1]
rev.delete(cc)
}
# Resolve definitions that share the same list of part names.
# fow:  part-name list => key (cc) of the definition currently owning that list
# rev2: cc => [[pos, part], ...] as it will be written out
fow = {}
rev2 = {}
rev.each {|cc, parts|
parts0 = parts
parts = parts.collect {|i| i[1] } #delete pos data
# Step 1: 'weak' priority. Bit 2 = the current owner has a weak snapshot,
# bit 1 = this definition has one.
if fow.has_key?(parts)
# $stderr.puts [cc, parts].inspect, (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0) if cc[0] =~ /[壌壤]/
case (weak.has_key?(fow[parts][0]) ? 2 : 0)+(weak.has_key?(cc[0]) ? 1 : 0)
when 0
#$stderr.print "undefined priority: #{fow[parts][0]}, #{cc[0]}\n"
when 3
#$stderr.print "can't determine priority: #{fow[parts][0]}, #{cc[0]}\n"
when 1
# only this definition is weak: fall back to the parts saved before its alias replacement
$stderr.print "canceling a replace: #{cc[0]}, #{parts}, #{weak[cc[0]][1]}\n"
parts0 = weak[cc[0]][1].dup
parts = parts0.collect {|i| i[1] }
when 2
# only the current owner is weak: this definition takes over the list, and the
# former owner becomes the one being processed, restored from its saved parts
cc2 = fow[parts]
fow[parts] = cc
cc = cc2
parts02 = rev2.delete(cc)
rev2[fow[parts]] = parts0
# NOTE(review): this assignment is overwritten two lines below
parts0 = parts02
$stderr.print "canceling a replace: #{cc[0]}, #{parts}, #{weak[cc[0]][1]}\n"
parts0 = weak[cc[0]][1].dup
parts = parts0.collect {|i| i[1] }
end
end
# Step 2: 'order-weak' priority, same bit layout, tested against order_weak.
if fow.has_key?(parts)
# $stderr.puts [cc, parts].inspect, (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0) if cc[0] =~ /[壌壤]/
case (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0)
when 0
$stderr.print "undefined order-priority: #{fow[parts][0]}, #{cc[0]}\n"
when 3
$stderr.print "can't determine order-priority: #{fow[parts][0]}, #{cc[0]}\n"
when 1
# this definition is order-weak: swap its first two parts
parts0 = parts0.dup
parts0[0..1] = [parts0[1], parts0[0]]
parts = parts0.collect {|i| i[1] }
when 2
# the current owner is order-weak: this definition takes over the list and the
# former owner is re-registered with its first two parts swapped
cc2 = fow[parts]
fow[parts] = cc
cc = cc2
parts02 = rev2.delete(cc)
rev2[fow[parts]] = parts0
parts0 = parts02
parts0 = parts0.dup
parts0[0..1] = [parts0[1], parts0[0]]
parts = parts0.collect {|i| i[1] }
end
end
# Step 3: still colliding -- report it; the assignment below replaces the owner in fow
if fow.has_key?(parts)
$stderr.print "too crowded: #{fow[parts][0]}, #{cc[0]}\n"
end
fow[parts] = cc
rev2[cc] = parts0
}
# Flatten each definition into one row:
#   [composed, part names..., class, positions...]
rev2 = rev2.collect {|cc, parts|
[cc[0]] + parts.collect {|i| i[1] } + [cc[1]] + parts.collect {|i| i[0] }
}
# order the rows by composed name
rev2.sort! {|i, j| i[0] <=> j[0] }
# NOTE(review): the output relies on Array#to_s concatenating the elements, which is
# the Ruby 1.8 behaviour; from 1.9 on Array#to_s returns the inspect form
rev2.each {|i|
if opt =~ /^--to-rev/
# first half of the row: composed name followed by the part names
puts i[0...i.length/2].to_s
elsif opt == '--to-thunder'
# the entry for '裁' is left out
next if i[0] == '裁'
# up to three leading elements, then up to three elements starting at the class
puts i[0, 3].to_s + i[i.length/2, 3].to_s
end
}
elsif opt == '--to-chise'
  # --to-chise: read the decomposition table and keep, per composed glyph, the
  # class 'a' / 'b' definitions that contain no parenthesised part, as
  # rev[[composed, class]] = [[pos, part], ...].
  #
  # Both tables are only ever assigned inside other branches of this if-chain,
  # so in this branch they would be nil and the first use would raise
  # NoMethodError; give this branch its own.
  rev = {}
  ali = {}
  while gets
    chop!
    next if $_ =~ /^\s*\#/
    # strip trailing comments and trailing whitespace
    gsub!(/\#.*$/, '')
    gsub!(/\s*$/, '')
    r = $_.split(/\s+/)
    # a blank line has no fields to work on
    next if r.empty?
    # first field: comma-separated names; bare numbers of 4+ digits are dropped
    l = r[0]
    l = l.split(/\,/)
    l.delete_if {|i| i =~ /^\d{4,}$/ }
    # nothing left that could name the composed glyph
    next if l.empty?
    # every further name resolves to the first one without a trailing @90JIS
    l[1..-1].each {|i|
      ali[i] = l[0].gsub(/\@90JIS$/, '')
    }
    composed = l[0].gsub(/\@90JIS$/, '')
    r.shift
    i = 0
    parts = []
    # set when the current group contains a part ending in "(...)"
    fparen = false
    while i < r.length
      if r[i] =~ /^([a-z\?]|\Sc)$/
        if r[i] == 'l'
          # 'l' and the field following it are skipped
          i += 2
          next
        elsif r[i] == '!c' || r[i] =~ /^\Sc$/
          # two-character markers ending in 'c' (such as '!c') are skipped
          i += 1
          next
        end
        # any other one-character marker is the class of the parts that follow
        c = r[i]
      else
        # tokens starting with one of = / < > | are skipped
        if r[i] =~ /^[\=\/\<\>\|]/
          i += 1
          next
        end
        l = r[i]
        fparen = true if l =~ /\((.*)\)$/
        # a part without a leading position character gets a blank position
        l = ' ' + l if l !~ /^[a-z\+\-]/
        l = [l[0..0], l[1..-1]]
        parts << l
        # the group ends at the last token or right before the next marker
        if (i == r.length-1 || r[i+1] =~ /^([a-z\?]|\Sc)$/) && parts.length > 0
          # only unparenthesised class a/b groups are kept
          if c !~ /^[ab]$/ || fparen
            parts = []
            fparen = false
            i += 1
            next
          end
          if rev.has_key?([composed, c])
            $stderr.print "repeated composition definition:\n"
            $stderr.print "#{[composed, c].inspect} => #{rev[[composed, c]].inspect}\n#{$_}\n"
            # NOTE(review): parts / fparen are left as they are on this path, so the
            # rejected parts carry over into the next group -- confirm intent
            i += 1
            next
          end
          rev[[composed, c]] = parts
          parts = []
          fparen = false
        end
      end
      i += 1
    end
  end
# Description string using U+2FFB (the "overlaid" ideographic description
# character) with c1 and c2 as operands.
def xx(c1,c2)
  operands = "#{c1},#{c2}"
  "U+2FFB(#{operands})"
end
# Description string using U+2FF0 (the "left to right" ideographic description
# character) with c1 and c2 as operands.
def lr(c1,c2)
  operands = "#{c1},#{c2}"
  "U+2FF0(#{operands})"
end
# Description string using U+2FF1 (the "above to below" ideographic description
# character) with c1 and c2 as operands.
def ud(c1,c2)
  operands = "#{c1},#{c2}"
  "U+2FF1(#{operands})"
end
# Description string for an outer/inner pair. c0 is matched against the
# patterns below in order; the first hit decides which U+2FFx operator is
# applied to c1 and c2. Returns nil when c0 matches the empty pattern or
# matches nothing at all.
def oi(c1,c2,c0)
  # NOTE(review): the empty first pattern and the empty last alternative of the
  # U+2FF9 pattern look like names that went missing -- confirm against the data.
  rules = [
    [/^()$/, nil],
    [/^(广|厂|やまいだれ|尸|在の外|虍|广廿|厂林|厂イ|雁たれ)$/, 'U+2FF8'],
    [/^(囗四)$/, 'U+2FF4'],
    [/^(門|冂|岡ひく山|風|微かまえ|戊|戌)$/, 'U+2FF5'],
    [/^(勹|裁かまえ|武ひく止|)$/, 'U+2FF9'],
    [/^(匚|匸)$/, 'U+2FF7'],
    [/^(凵)$/, 'U+2FF6'],
    [/^(之|え|廴|走|夂)$/, 'U+2FFA'],
    # original remark for this entry: U+2FF3 / o->u+d, cf. 'udi'
    [/^(行|衣)$/, 'U+2FFB']
  ]
  rules.each do |pattern, operator|
    next unless c0 =~ pattern
    return operator ? "#{operator}(#{c1},#{c2})" : nil
  end
  nil
end
# Convert every stored decomposition into an ideographic description string and
# collect [key, description] pairs in rev2.
# NOTE(review): rev2 is not printed anywhere in the visible part of this branch.
rev2 = []
# position letters in the order the parts are arranged before dispatching
order = %w(l r u d o i p s m x + -)
rev.each {|cc, parts|
  parts.sort! {|i, j| order.index(i[0]) <=> order.index(j[0]) }
  # Concatenated position letters, e.g. "lr". join gives the same string that
  # Array#to_s gave on Ruby 1.8 and keeps working where to_s means inspect.
  shape = parts.collect {|i| i[0] }.join
  case shape
  when '+-'
    next
  when 'xx'
    rev2 << [cc, xx(parts[0][1],parts[1][1])]
  when 'mm'
    rev2 << [cc, xx(parts[0][1],parts[1][1])]
  when 'lr'
    rev2 << [cc, lr(parts[0][1],parts[1][1])]
  when 'ud'
    rev2 << [cc, ud(parts[0][1],parts[1][1])]
  when 'oi'
    rev2 << [cc, oi(parts[0][1],parts[1][1], cc[0])]
  when 'lu'
    rev2 << [cc, "U+2FF9(#{parts[1][1]},#{parts[0][1]})"]
    #u->l2+r, lr(ud(l2,l),r) or
    #u->u2+d, ud(u2,lr(l,d))
  when 'ld'
    # no description is produced for this shape
  when 'ru'
    rev2 << [cc, "U+2FF8(#{parts[1][1]},#{parts[0][1]})"]
  when 'rd'
    rev2 << [cc, "U+2FFA(#{parts[1][1]},#{parts[0][1]})"]
  when 'lri'
    rev2 << [cc, "U+2FF2(#{parts[0][1]},#{parts[2][1]},#{parts[1][1]})"]
  when 'udi'
    rev2 << [cc, "U+2FF3(#{parts[0][1]},#{parts[2][1]},#{parts[1][1]})"]
  when 'lru'
    rev2 << [cc, ud(parts[2][1], lr(parts[0][1], parts[1][1]))]
  when 'lrd'
    rev2 << [cc, ud(lr(parts[0][1], parts[1][1]), parts[2][1])]
  when 'udo'
    # oi takes three arguments; pass the composed name like the 'oi' case does
    # (the two-argument call raised ArgumentError).
    rev2 << [cc, oi(parts[2][1], ud(parts[0][1], parts[1][1]), cc[0])]
  when 'uo'
    #o->o2+i, oi(o2,ud(u,i))
    raise NotImplementedError, shape
  when 'do'
    raise NotImplementedError, shape
  else
    # The exception class was spelled with a lower-case initial, which is an
    # undefined name and raised NameError instead of the intended error.
    raise NotImplementedError, shape
  end
}
# --for-tex: wrap the input in a LaTeX document (jarticle class, utf package),
# one paragraph per input line.
elsif opt == '--for-tex'
#http://psitau.at.infoseek.co.jp/otf.html
#http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?OTF
puts '\documentclass{jarticle}'
puts '\usepackage{utf}'
puts '\begin{document}'
while gets
chop!
# drop a trailing " #"
gsub!(/\s\#$/, '')
# NOTE(review): as written this replaces a blank with a blank; the pattern or the
# replacement may originally have held a different whitespace character -- confirm
gsub!(/ /, ' ')
# escape TeX specials: < > \ go into \verb|...|, '|' becomes \textbar, the rest get a backslash
gsub!(/[\#$%&_{}^~<>\\|]/) {|i| i =~ /[<>\\]/ ? '\verb|'+i+'|' : i == '|' ? '\textbar ' : '\\'+i }
# a leading number of 4+ digits is shown small and followed by \CID{number}
gsub!(/^\d{4,}/) { '{\small '+$&+'}\CID{'+$&+'}' }
# every U+XXXX (four upper-case hex digits) is followed by \UTF{XXXX}
gsub!(/U\+([0-9A-F]{4})/) { $&+'\UTF{'+$1+'}' }
puts $_
# blank line so that each entry becomes its own paragraph
puts
end
puts '\end{document}'
elsif false
  # Disabled branch (never taken): append a " #c =<target> " remark to every
  # entry whose name has an entry in the itaiji table.
  itaiji = {}
  fp = open('./emacs/tcode/itaiji.maz')
  while fp.gets
    chop!
    # first two whitespace-separated fields: key and value
    i = $_.split(/\s/)
    itaiji[i[0]] = i[1]
  end
  fp.close
  while gets
    chop!
    # comment lines are echoed unchanged
    if $_ =~ /^\s*\#/
      puts $_
      next
    end
    r = $_.split(/\s+/)
    # first field: comma-separated names; bare numbers of 4+ digits are dropped
    l = r[0]
    l = l.split(/\,/)
    l.delete_if {|i| i =~ /^\d{4,}$/ }
    c = l[0].gsub(/\@90JIS$/, '')
    if itaiji[c]
      # Look the remark up with c, the key that was just tested. The previous
      # itaiji[$1] had no live capture group, so it always interpolated nothing.
      $_ << " #c =#{itaiji[c]} "
    end
    puts $_
  end
# Disabled branch (never taken): turn a tab-separated CID table into
# "<cid> #<glyph>" lines.
elsif false
#ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/adobe/
while gets
chop!
# skip comment lines and the header row
next if $_ =~ /^\#/
next if $_ =~ /^CID/
codes = $_.split(/\t/)
# column 0: CID (decimal); columns 1 and 13: hexadecimal code values
cid = codes[0].to_i
c90jis = codes[1].hex
c78jis = codes[13].hex
next if c78jis == 0 && c90jis == 0
# Build a string from the two code bytes, each raised by 0x80
# (presumably the EUC-JP form of a JIS code -- TODO confirm).
# NOTE(review): assigning an Integer through String#[]= only works on Ruby 1.8,
# and w must already hold two bytes for w[1]; whitespace inside the literal may
# have been collapsed -- confirm against the original file.
w = ' '
w[0] = (c90jis>>8)+0x80
w[1] = (c90jis&0xff)+0x80
c90jis = w
w = ' '
w[0] = (c78jis>>8)+0x80
w[1] = (c78jis&0xff)+0x80
c78jis = w
if c90jis == c78jis
# both columns give the same glyph: print it once
printf "%-8d\#%s\n", cid, c90jis
else
# otherwise list both, tagged with their source
l = []
l << c90jis+"@90JIS" if c90jis.length > 0
l << c78jis+"@78JIS" if c78jis.length > 0
printf "%-8d#%s\n", cid, l.join(",")
end
end
else
# unknown or missing mode switch: show the synopsis
usage()
end ####