#!/usr/bin/ruby -Ke # #cidbushu.txt: #1505,乾 l 常用 b l十日十 r乞 a u十日十人 r乙 #↑ #第1フィールド: 合成後どうなるか。コンマ区切りでいくらでも。 # すぺて同一の字形を指し示すための表現である # 4桁以上の生数字: Adobe-Japan1(-6)におけるCID # U+〜〜: Unicode@MSゴシック字形小塚明朝字形 # その他文字コード表現: (略) # 生漢字1字{@90JIS}: その生漢字のバイト列が指すコードポイントの # JIS X 0208:1990での例示字形 # 生漢字1字@78JIS: 前略JIS X 0208:1978後略 # 生漢字1字@04JIS: 前略JIS X 0213:2004後略 # その他(3桁以下の半角数字なども含む): JIS X 0208:1990に入って # いないグリフに適宜つけた通称 # #それより後ろ: まずスペース区切り #l: 次のフィールドと合わせて法律的情報 #a: 分けかたは合ってる&&字形をある程度表現できてる(括弧→字形を表現できてない主要因) #b: 分けかたは間違ってる&&字形をある程度表現できてる #c: 正しい分けかた&&意味記述(字形が表現できてない)または異体字情報 #!c: 分割不能な象形文字等 フィールドを消費しない #その他半角英字や'?'など1文字: 作業中である。とりあえずbと同等に扱うこと # #cがなくてaがあるのは #1. aが分けかた字形意味全てにおいて完璧 #2. aで括弧ついてないのは意味も合ってる(それで括弧つきのほうは一言じゃ説明できない) # (括弧つきのが正しい意味ならcに置けばいい) #aと!c両方あるのは分割不能かどうか微妙 #c =?はその先が本字 たどっていくと意味が合ってる解字が出てくるはず #c /?は自分が借字となってその字の意味を取り込んでしまったパターン #c ?はその字の俗字だったものが別の意味を獲得(のっとられ側の字がない借字?) #c |?はどっちがえらいとも言い難い同音同義別解字("異体""別体"もここに含むべき) #("通じる"は/?か|?か微妙) # #a-cの後ろに任意長さの分解定義が続く #a (位置を表す英字1文字)(グリフを指し示す表現) ()() ... b ()() ... #作業中なので位置指定子が記述されていない場合がある #l&r: 偏と旁 u&d: 冠と脚 o&i: 構や垂や繞、あとむりやり間に挟める #p&s: 品口3の口がp(arts)で3がs(tructure) +&-: 引き算 #lまたはrとuまたはdの組み合わせは直線で切り分けられるのにしてはいけないパターン #http://www.itscj.ipsj.or.jp/ipsj-ts/02-02/ips_charid/toc.htm #↑のP4〜P7に近い感じ # # # # #http://www2.odn.ne.jp/alt-quinon/files/ptex/x0213/jx2004tbl.pdf #http://pc5.2ch.net/test/read.cgi/unix/1082032043/183 #http://www.taishukan.co.jp/kanji/archive/jinmei_minaoshi.html def usage $stderr.print "ruby #{$0} (--format|--sort|--to-rev{-tc}|--to-thunder|--to-chise|--for-tex) cidbushu.txt\n" end opt = ARGV.shift if opt == '--format' while gets chop! $_ << ' ' if $_[0..0] == "#" puts $_ next end r = $_.split(/\s+/) #p r if r.length > 2 && r[1] == 'l' print r[0], ' l ', r[2], ' ' * [21-r[0].length-r[2].length, 1].max r.shift r.shift else print r[0], ' ' * (24 - r[0].length) end r.shift while r.length > 1 && r[0] !~ /^\#/ if r[0] =~ /^([a-z\?]|\Sc)$/ print r[0], ' ' * [2 - r[0].length, 1].max r.shift else #if r[1] !~ /^([a-z\?]|\Sc)$/ && r[1] !~ /^[a-z\+\-\=\/\<\>\|]/ # r[1] = 'r' + r[1] if r[0] =~ /^l/ # r[1] = 'l' + r[1] if r[0] =~ /^r/ # r[1] = 'd' + r[1] if r[0] =~ /^u/ # r[1] = 'u' + r[1] if r[0] =~ /^d/ # r[1] = 'i' + r[1] if r[0] =~ /^o/ # r[1] = 'o' + r[1] if r[0] =~ /^i/ # r[1] = '+' + r[1] if r[0] =~ /^-/ # r[1] = '-' + r[1] if r[0] =~ /^+/ #end r[0] = ' ' + r[0] if r[0] !~ /^[a-z\+\-\=\/\<\>\|]/ print r[0], ' ' * [12 - r[0].length, 1].max r.shift end end print r.join(' ') if r.length > 0 print "\n" end elsif opt == '--sort' ali = Hash.new def ali.[](key) return key if ! self.has_key?(key) super end fp = open('cidbushu.alias') while fp.gets chop! next if $_ =~ /^\s*\#/ gsub!(/\#.*$/, '') gsub!(/\s*$/, '') next if $_ =~ /^(delete|overwrite|protect)/ r = $_.split(/\s+/) next if r.length <= 1 r.shift if r[0] =~ /^(order-(strong|weak)|weak)/ d = r.shift r.each {|i| ali[i] = d if i.length > 2 } end fp.close d = Array.new while gets chop! r = $_.split(/\s+/) if r.length < 1 || r[0] =~ /^\#/ puts $_ next end r = r[0] r = r.split(/\,/) r.delete_if {|i| i =~ /^\d{4,}$/ } r = r[0] #p r d.push([r, $_]) if $_ =~ /c \=(\S+)/ && $1.length > 2 ali[$1] = r #p r, $1 end end d.each {|i| i.unshift(ali[ali[ali[i[0]]]].gsub(/^(常用|旧|たて|よこ|たれ|にょう)(.)/) { $2 }) } d.sort! {|i, j| [i[0][0..1], i[1][0..2], j[1][2..-1]] <=> [j[0][0..1], j[1][0..2], i[1][2..-1]] } d.each {|i| #p i[0..1] puts i[2] } elsif opt == '--to-expand' elsif opt =~ /^--to-rev/ || opt == '--to-thunder' PREFER_REGULAR = false if opt == '--to-thunder' prefer = < [[pos, part], ...] order_weak = [] weak = {} ali = {} delp = [] dels = [] def ali.[](*key) if key.length == 1 key = key[0] return key if ! self.has_key?(key) super elsif key.length == 2 pos = key[1] key = key[0] if pos =~ /^[亜-腕]$/ && self.has_key?(key+'(inJIS1)') return super(key+'(inJIS1)') elsif pos =~ /^[弌-熙]$/ && self.has_key?(key+'(inJIS2)') return super(key+'(inJIS2)') end return key if ! self.has_key?(key) super(key) end end while gets chop! next if $_ =~ /^\s*\#/ gsub!(/\#.*$/, '') gsub!(/\s*$/, '') r = $_.split(/\s+/) l = r[0] l = l.split(/\,/) l.delete_if {|i| i =~ /^\d{4,}$/ } l.each {|i| ali[i] = l[0].gsub(/\@90JIS$/, '') } composed = l[0].gsub(/\@90JIS$/, '') r.shift i = 0 parts = [] while i < r.length if r[i] =~ /^([a-z\?]|\Sc)$/ if r[i] == 'l' i += 2 next elsif r[i] == '!c' || r[i] =~ /^\Sc$/ i += 1 next end c = r[i] else if r[i] =~ /^[\=\/\<\>\|]/ i += 1 next end l = r[i].gsub(/\((.*)\)$/) { $1 } #remove parenthesis meaning incorrect figure l = ' ' + l if l !~ /^[a-z\+\-]/ l = [l[0..0], l[1..-1]] parts << l if (i == r.length-1 || r[i+1] =~ /^([a-z\?]|\Sc)$/) && parts.length > 0 if rev.has_key?([composed, c]) $stderr.print "repeated composition definition:\n" $stderr.print "#{[composed, c].inspect} => #{rev[[composed, c]].inspect}\n#{$_}\n" i += 1 next end rev[[composed, c]] = parts parts = [] end end i += 1 end end fp = open('cidbushu.alias') prot = [] while fp.gets chop! $stderr.print "#{$_}\n" next if $_[0..0] == '#' gsub!(/\#.*$/, '') gsub!(/\s*$/, '') r = $_.split(/\s+/) case r[0] when 'delete' for i in 1...r.length for j in 'abcdefijklmn?'.split(//) next unless rev.has_key?([r[i], j]) parts = rev.delete([r[i], j]) end end when 'overwrite' for j in 'abcdefijklmn?'.split(//) next unless rev.has_key?([r[1], j]) parts = rev.delete([r[1], j]) end parts = r[2..-1] parts.each_index {|i| l = parts[i] l = ' ' + l if l !~ /^[a-z\+\-]/ l = [l[0..0], ali[ali[ali[ali[l[1..-1], r[1]], r[1]], r[1]], r[1]]] parts[i] = l } rev[[r[1], 'b']] = parts when 'protect' prot = r[1..-1] # when 'order-swap' # for i in 'abcdefijklmn?'.split(//) # next unless rev.has_key?([r[1], i]) # parts = rev[[r[1], i]] # parts[0..1] = [parts[1], parts[0]] # rev[[r[1], i]] = parts # end # rev.each {|cc, parts| # next unless parts[0..1].collect {|i| i[1] }.include?(r[1]) # parts[0..1] = [parts[1], parts[0]] # rev[cc] = parts # } ### # when 'try-making-malanalysis' ### when 'entry-prefix-will-be-deleted' delp += r[1..-1] when 'entry-suffix-will-be-deleted' dels += r[1..-1] else frec = nil if r[0] == 'order-strong' r.shift frec = :ostrong elsif r[0] == 'order-weak' r.shift frec = :oweak elsif r[0] == 'weak' r.shift frec = :weak end for j in 1...r.length #後からoverwriteする場合の救済措置 ali[r[j]] = r[0] end rev.each {|cc, parts| #next unless parts.collect {|i| i[1] }.include?(r[1]) fcr = false for j in 1...r.length parts.each {|k| rj = r[j].dup fquit = false if r[j] =~ /\(inJIS1\)$/ rj.gsub!(/\(inJIS1\)$/, '') #$stderr.puts cc.inspect next unless cc[0] =~ /^[亜-腕]$/ elsif r[j] =~ /\(inJIS2\)$/ rj.gsub!(/\(inJIS2\)$/, '') #$stderr.puts cc.inspect next unless cc[0] =~ /^[弌-熙]$/ end if k[1] == rj weak[cc[0]] = [cc.collect {|l| l.dup}, parts.collect {|l| l.dup}] if frec == :weak #後からaliasされるのには未対応 k[1] = r[0] order_weak |= [cc[0]] if frec == :oweak fcr = true if r[0] == cc[0] elsif k[1] == r[0] order_weak |= [cc[0]] if frec == :ostrong end } end if fcr #$stderr.print "deleted a crunched definition: #{cc.inspect} => #{parts.inspect}\n" rev.delete(cc) end } for i in 'abcdefijklmn?'.split(//) for j in 1...r.length next if prot.include?(r[j]) next unless rev.has_key?([r[j], i]) parts = rev.delete([r[j], i]) fcr = false parts.each {|k| fcr = true if k[1] == r[0] } if fcr $stderr.print "deleted a crunched definition: #{[r[0], i].inspect} => #{parts.inspect}\n" else if rev.has_key?([r[0], i]) $stderr.print "overwriting a collided definition by: #{[r[0], i].inspect} => #{parts.inspect}\n" end rev[[r[0], i]] = parts end end end prot = [] end end fp.close delp = Regexp.new('^(' + delp.collect {|i| Regexp.escape(i) }.join('|') + ')(.)$') dels = Regexp.new('^(.)(' + dels.collect {|i| Regexp.escape(i) }.join('|') + ')$') reva = rev.dup rev.each {|cc, parts| parts = parts.collect {|i| i[1] = i[1].gsub(delp) { $2 }; i } parts = parts.collect {|i| i[1] = i[1].gsub(dels) { $1 }; i } if parts.collect {|i| i[1] }.include?(cc[0]) $stderr.print "deleted a crunched definition: #{cc.inspect} => #{parts.inspect}\n" rev.delete(cc) next end rev[cc] = parts if cc[0].length > 2 || parts.collect {|i| i[1].length }.max > 2 $stderr.puts [cc, parts].inspect unless cc[0] =~ /@\d\dJIS$/ rev.delete(cc) next end parts.each {|i| if i[1] =~ /\?/ rev.delete(cc) break end } } rev2 = {} rev.each {|cc, parts| if rev2.has_key?(cc[0]) p = (rev2[cc[0]]=~/^[a-c]$/ ? rev2[cc[0]] : 'b'+rev2[cc[0]]) #p cc[1] n = (cc[1]=~/^[a-c]$/ ? cc[1] : 'b'+cc[1]) if PREFER_REGULAR else p = p.gsub(/^([ab])/) { $1 == 'a' ? 'b' : 'a' } n = n.gsub(/^([ab])/) { $1 == 'a' ? 'b' : 'a' } end if p > n rev2[cc[0]] = cc[1] end else rev2[cc[0]] = cc[1] end } rev.each {|cc, parts| next if rev2[cc[0]] == cc[1] rev.delete(cc) } fow = {} rev2 = {} rev.each {|cc, parts| parts0 = parts parts = parts.collect {|i| i[1] } #delete pos data if fow.has_key?(parts) # $stderr.puts [cc, parts].inspect, (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0) if cc[0] =~ /[壌壤]/ case (weak.has_key?(fow[parts][0]) ? 2 : 0)+(weak.has_key?(cc[0]) ? 1 : 0) when 0 #$stderr.print "undefined priority: #{fow[parts][0]}, #{cc[0]}\n" when 3 #$stderr.print "can't determine priority: #{fow[parts][0]}, #{cc[0]}\n" when 1 $stderr.print "canceling a replace: #{cc[0]}, #{parts}, #{weak[cc[0]][1]}\n" parts0 = weak[cc[0]][1].dup parts = parts0.collect {|i| i[1] } when 2 cc2 = fow[parts] fow[parts] = cc cc = cc2 parts02 = rev2.delete(cc) rev2[fow[parts]] = parts0 parts0 = parts02 $stderr.print "canceling a replace: #{cc[0]}, #{parts}, #{weak[cc[0]][1]}\n" parts0 = weak[cc[0]][1].dup parts = parts0.collect {|i| i[1] } end end if fow.has_key?(parts) # $stderr.puts [cc, parts].inspect, (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0) if cc[0] =~ /[壌壤]/ case (order_weak.include?(fow[parts][0]) ? 2 : 0)+(order_weak.include?(cc[0]) ? 1 : 0) when 0 $stderr.print "undefined order-priority: #{fow[parts][0]}, #{cc[0]}\n" when 3 $stderr.print "can't determine order-priority: #{fow[parts][0]}, #{cc[0]}\n" when 1 parts0 = parts0.dup parts0[0..1] = [parts0[1], parts0[0]] parts = parts0.collect {|i| i[1] } when 2 cc2 = fow[parts] fow[parts] = cc cc = cc2 parts02 = rev2.delete(cc) rev2[fow[parts]] = parts0 parts0 = parts02 parts0 = parts0.dup parts0[0..1] = [parts0[1], parts0[0]] parts = parts0.collect {|i| i[1] } end end if fow.has_key?(parts) $stderr.print "too crowded: #{fow[parts][0]}, #{cc[0]}\n" end fow[parts] = cc rev2[cc] = parts0 } rev2 = rev2.collect {|cc, parts| [cc[0]] + parts.collect {|i| i[1] } + [cc[1]] + parts.collect {|i| i[0] } } rev2.sort! {|i, j| i[0] <=> j[0] } rev2.each {|i| if opt =~ /^--to-rev/ puts i[0...i.length/2].to_s elsif opt == '--to-thunder' next if i[0] == '裁' puts i[0, 3].to_s + i[i.length/2, 3].to_s end } elsif opt == '--to-chise' while gets chop! next if $_ =~ /^\s*\#/ gsub!(/\#.*$/, '') gsub!(/\s*$/, '') r = $_.split(/\s+/) l = r[0] l = l.split(/\,/) l.delete_if {|i| i =~ /^\d{4,}$/ } l[1..-1].each {|i| ali[i] = l[0].gsub(/\@90JIS$/, '') } composed = l[0].gsub(/\@90JIS$/, '') r.shift i = 0 parts = [] fparen = false while i < r.length if r[i] =~ /^([a-z\?]|\Sc)$/ if r[i] == 'l' i += 2 next elsif r[i] == '!c' || r[i] =~ /^\Sc$/ i += 1 next end c = r[i] else if r[i] =~ /^[\=\/\<\>\|]/ i += 1 next end l = r[i] fparen = true if l =~ /\((.*)\)$/ l = ' ' + l if l !~ /^[a-z\+\-]/ l = [l[0..0], l[1..-1]] parts << l if (i == r.length-1 || r[i+1] =~ /^([a-z\?]|\Sc)$/) && parts.length > 0 if c !~ /^[ab]$/ || fparen parts = [] fparen = false i += 1 next end if rev.has_key?([composed, c]) $stderr.print "repeated composition definition:\n" $stderr.print "#{[composed, c].inspect} => #{rev[[composed, c]].inspect}\n#{$_}\n" i += 1 next end rev[[composed, c]] = parts parts = [] fparen = false end end i += 1 end end def xx(c1,c2); "U+2FFB(#{c1},#{c2})"; end def lr(c1,c2); "U+2FF0(#{c1},#{c2})"; end def ud(c1,c2); "U+2FF1(#{c1},#{c2})"; end def oi(c1,c2,c0) if c0 =~ /^()$/ elsif c0 =~ /^(广|厂|やまいだれ|尸|在の外|虍|广廿|厂林|厂イ|雁たれ)$/ "U+2FF8(#{c1},#{c2})" elsif c0 =~ /^(囗四)$/ "U+2FF4(#{c1},#{c2})" elsif c0 =~ /^(門|冂|岡ひく山|風|微かまえ|戊|戌)$/ "U+2FF5(#{c1},#{c2})" elsif c0 =~ /^(勹|裁かまえ|武ひく止|)$/ "U+2FF9(#{c1},#{c2})" elsif c0 =~ /^(匚|匸)$/ "U+2FF7(#{c1},#{c2})" elsif c0 =~ /^(凵)$/ "U+2FF6(#{c1},#{c2})" elsif c0 =~ /^(之|え|廴|走|夂)$/ "U+2FFA(#{c1},#{c2})" elsif c0 =~ /^(行|衣)$/ #U+2FF3 #o->u+d, cf.'udi' "U+2FFB(#{c1},#{c2})" end end rev2 = [] rev.each {|cc, parts| parts.sort! {|i, j| %w(l r u d o i p s m x + -).index(i[0]) <=> %w(l r u d o i p s m x + -).index(j[0]) } case parts.collect {|i| i[0]}.to_s when '+-' next when 'xx' rev2 << [cc, xx(parts[0][1],parts[1][1])] when 'mm' rev2 << [cc, xx(parts[0][1],parts[1][1])] when 'lr' rev2 << [cc, lr(parts[0][1],parts[1][1])] when 'ud' rev2 << [cc, ud(parts[0][1],parts[1][1])] when 'oi' rev2 << [cc, oi(parts[0][1],parts[1][1], cc[0])] when 'lu' rev2 << [cc, "U+2FF9(#{parts[1][1]},#{parts[0][1]})"] #u->l2+r, lr(ud(l2,l),r) or #u->u2+d, ud(u2,lr(l,d)) when 'ld' # when 'ru' rev2 << [cc, "U+2FF8(#{parts[1][1]},#{parts[0][1]})"] when 'rd' rev2 << [cc, "U+2FFA(#{parts[1][1]},#{parts[0][1]})"] when 'lri' rev2 << [cc, "U+2FF2(#{parts[0][1]},#{parts[2][1]},#{parts[1][1]})"] when 'udi' rev2 << [cc, "U+2FF3(#{parts[0][1]},#{parts[2][1]},#{parts[1][1]})"] when 'lru' rev2 << [cc, ud(parts[2][1], lr(parts[0][1], parts[1][1]))] when 'lrd' rev2 << [cc, ud(lr(parts[0][1], parts[1][1]), parts[2][1])] when 'udo' rev2 << [cc, oi(parts[2][1], ud(parts[0][1], parts[1][1]))] when 'uo' #o->o2+i, oi(o2,ud(u,i)) raise notImplementedError, "#{parts.collect {|i| i[0]}.to_s}" when 'do' raise notImplementedError, "#{parts.collect {|i| i[0]}.to_s}" else raise notImplementedError, "#{parts.collect {|i| i[0]}.to_s}" end } elsif opt == '--for-tex' #http://psitau.at.infoseek.co.jp/otf.html #http://oku.edu.mie-u.ac.jp/~okumura/texwiki/?OTF puts '\documentclass{jarticle}' puts '\usepackage{utf}' puts '\begin{document}' while gets chop! gsub!(/\s\#$/, '') gsub!(/ /, ' ') gsub!(/[\#$%&_{}^~<>\\|]/) {|i| i =~ /[<>\\]/ ? '\verb|'+i+'|' : i == '|' ? '\textbar ' : '\\'+i } gsub!(/^\d{4,}/) { '{\small '+$&+'}\CID{'+$&+'}' } gsub!(/U\+([0-9A-F]{4})/) { $&+'\UTF{'+$1+'}' } puts $_ puts end puts '\end{document}' elsif false itaiji = {} fp = open('./emacs/tcode/itaiji.maz') while fp.gets chop! i = split(/\s/) itaiji[i[0]] = i[1] end fp.close while gets chop! if $_ =~ /^\s*\#/ puts $_ next end r = $_.split(/\s+/) l = r[0] l = l.split(/\,/) l.delete_if {|i| i =~ /^\d{4,}$/ } c = l[0].gsub(/\@90JIS$/, '') if itaiji[c] $_ << " #c =#{itaiji[$1]} " end puts $_ end elsif false #ftp://ftp.oreilly.com/pub/examples/nutshell/cjkv/adobe/ while gets chop! next if $_ =~ /^\#/ next if $_ =~ /^CID/ codes = $_.split(/\t/) cid = codes[0].to_i c90jis = codes[1].hex c78jis = codes[13].hex next if c78jis == 0 && c90jis == 0 w = ' ' w[0] = (c90jis>>8)+0x80 w[1] = (c90jis&0xff)+0x80 c90jis = w w = ' ' w[0] = (c78jis>>8)+0x80 w[1] = (c78jis&0xff)+0x80 c78jis = w if c90jis == c78jis printf "%-8d\#%s\n", cid, c90jis else l = [] l << c90jis+"@90JIS" if c90jis.length > 0 l << c78jis+"@78JIS" if c78jis.length > 0 printf "%-8d#%s\n", cid, l.join(",") end end else usage() end ####