RubyとPythonで全角文字を半角文字2文字として数えるその2

前回のRubyのソースでは、ASCII文字と半角カナかどうかを半角文字の判定に使用していた。今回はPythonと同様に、Unicodeの定義から判定できるようにクラスを作成した。
ついでに右寄せと中央寄せも追加した。

Singletonとして実装しており、最初にインスタンスを取得したときにUnicode定義ファイル(EastAsianWidth.txt)を読み込む。
定義リストの検索は2分探索で行う。

実行には以下のURLにある EastAsianWidth.txt が必要。
http://www.unicode.org/reports/tr11/
http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt

追記: 速度が遅かったためbsearchから文字列処理を外に出して高速化

Ruby 1.8.6
unicodedata.rb

require "singleton"

# Lookup table for the Unicode East Asian Width property (Singleton).
#
# The table is built from EastAsianWidth.txt, which is opened by that bare
# file name when the singleton instance is first created. The file is
# available at:
#   http://www.unicode.org/Public/UNIDATA/EastAsianWidth.txt
#
# The only public operation is east_asian_width, offered both as a class
# method and as an instance method.
class Unicodedata
    include Singleton

    # Reads EastAsianWidth.txt and builds @list, an array of
    # [first_code, last_code, width] triples kept in file order.
    # The binary search below relies on that order being ascending.
    def initialize
        # Block form closes the handle even when reading raises.
        lines = File.open("EastAsianWidth.txt", "r") { |f| f.readlines }

        @list = []
        lines.each do |line|
            s_code, e_code, eaw = get_code(line)
            next unless s_code && e_code && eaw

            last = @list[-1]
            if last && s_code == last[1] + 1 && eaw == last[2]
                # Contiguous range with the same width: extend the previous entry
                last[1] = e_code
            else
                @list << [s_code, e_code, eaw]
            end
        end
    end

    # Class-level shortcut that delegates to the singleton instance.
    def self.east_asian_width(character)
        instance.east_asian_width(character)
    end

    # Returns the East Asian Width value (the string found in the data file,
    # e.g. "W" or "Na") for the first code point of +character+.
    # Returns "N" when the code point is not covered by any loaded range.
    def east_asian_width(character)
        find_code = character.unpack("U")[0].to_i   # Unicode code point to look up
        bsearch(find_code, @list, 0, @list.size - 1)
    end

    private

    # Binary search over +list+ between indexes +min+ and +max+ (inclusive)
    # for the range containing +target+.
    # Returns the width of the matching range, or "N" when none matches.
    def bsearch(target, list, min, max)
        while min <= max
            index = (min + max) / 2
            s_code, e_code, eaw = list[index]

            if s_code > target          # target lies before this range
                max = index - 1
            elsif e_code < target       # target lies after this range
                min = index + 1
            else
                return eaw              # target is inside this range
            end
        end
        "N"
    end

    # Parses one line of the data file.
    # Returns [first_code, last_code, width], or nil when the line carries
    # no "code;width" data once the trailing comment is removed.
    # The caller's string is left untouched.
    def get_code(line)
        data = line.chomp.gsub(/\s*#.*/, "")    # strip trailing comment
        return unless /.+;.+/ =~ data           # not a data line

        code_n_eaw = data.split(/\s*;\s*/)
        codes = code_n_eaw[0].split(/\s*\.\.\s*/)

        s_code = codes[0].hex
        e_code = codes[1] ? codes[1].hex : s_code       # single code point: range of one
        eaw = code_n_eaw[1] ? code_n_eaw[1].strip : nil

        return s_code, e_code, eaw
    end
end

# Create the singleton up front so the definition file is read at load time
Unicodedata.instance

require "unicodedata"

# Left-justify.
# Measures +str+ with width_kana and, when it is narrower than +size+,
# appends +pad+ once per missing column. Otherwise +str+ is returned as is.
def ljust_kana(str, size, pad = " ")
    shortfall = size - width_kana(str)
    return str unless shortfall > 0

    str + pad * shortfall
end

# Right-justify.
# Measures +str+ with width_kana and, when it is narrower than +size+,
# prepends +pad+ once per missing column. Otherwise +str+ is returned as is.
def rjust_kana(str, size, pad = " ")
    missing = size - width_kana(str)
    missing > 0 ? pad * missing + str : str
end

# Center.
# Measures +str+ with width_kana and, when it is narrower than +size+,
# surrounds it with +pad+. With an odd number of missing columns the extra
# pad goes on the right. Otherwise +str+ is returned as is.
def center_kana(str, size, pad = " ")
    shortfall = size - width_kana(str)
    return str unless shortfall > 0

    half = shortfall / 2.0
    left = pad * half.truncate      # rounded down
    right = pad * half.ceil         # rounded up
    left + str + right
end

# Width in half-width columns.
# Every character counts as one column and each full-width character
# (as counted by count_zen) adds one more, i.e. zenkaku * 2 + hankaku.
def width_kana(str)
    char_total = str.split(//u).size
    char_total + count_zen(str)
end

# Number of full-width characters.
# A character counts as full-width when its East Asian Width value, as
# reported by Unicodedata.east_asian_width, is contained in "WFA".
def count_zen(str)
    wide_classes = "WFA"
    wide = str.split(//u).select do |ch|
        wide_classes.include?(Unicodedata.east_asian_width(ch))
    end
    wide.size
end

# main
# Print every sample left-justified, then centered, then right-justified,
# each padded to 20 columns and closed with "|" to make the edge visible.
samples = ["日本語", "ﾆﾎﾝｺﾞ", "nihongo", "niﾎﾝ語"]
[:ljust_kana, :center_kana, :rjust_kana].each do |aligner|
    samples.each do |text|
        puts send(aligner, text, 20) + "|"
    end
end

Python 2.5.1

#!/usr/bin/env python
# -*- coding: utf-8 -*-

import math
import unicodedata

# Left-justify
def ljust_kana(str, size, pad = " "):
    """Return ``str`` padded on the right with ``pad`` up to ``size`` columns.

    The width is measured with width_kana(); when ``str`` already fills
    ``size`` columns or more it is returned unchanged.
    """
    shortfall = size - width_kana(str)
    if shortfall <= 0:
        return str
    return str + pad * shortfall
    
# Right-justify
def rjust_kana(str, size, pad = " "):
    """Return ``str`` padded on the left with ``pad`` up to ``size`` columns.

    The width is measured with width_kana(); when ``str`` already fills
    ``size`` columns or more it is returned unchanged.
    """
    missing = size - width_kana(str)
    if missing <= 0:
        return str
    return pad * missing + str
    
# Center
def center_kana(str, size, pad = " "):
    """Return ``str`` centered within ``size`` columns using ``pad``.

    The width is measured with width_kana(). With an odd number of missing
    columns the extra pad goes on the right. When ``str`` already fills
    ``size`` columns or more it is returned unchanged.
    """
    shortfall = size - width_kana(str)
    if shortfall <= 0:
        return str
    half = shortfall / 2.0
    left = pad * int(math.floor(half))
    right = pad * int(math.ceil(half))
    return left + str + right
    
# Width (in half-width columns)
def width_kana(str):
    """Return the width of ``str`` counted in half-width columns.

    Every character counts as one column and each full-width character
    (as counted by count_zen()) adds one more.
    """
    total = len(str)
    wide = count_zen(str)
    return total + wide

# Number of full-width characters
def count_zen(str):
    """Return how many characters of ``str`` are full-width.

    A character counts as full-width when unicodedata.east_asian_width()
    reports a value contained in "WFA".
    """
    wide_classes = u"WFA"
    return sum(1 for ch in str
               if unicodedata.east_asian_width(ch) in wide_classes)
    
# main
# Print every sample left-justified, then centered, then right-justified,
# each padded to 20 columns and closed with "|" to make the edge visible.
if __name__ == "__main__":
    samples = [u"日本語", u"ﾆﾎﾝｺﾞ", u"nihongo", u"niﾎﾝ語"]
    for align in (ljust_kana, center_kana, rjust_kana):
        for text in samples:
            # A parenthesised single argument is valid both as a Python 2
            # print statement and as a Python 3 print() call.
            print(align(text, 20) + "|")

実行結果

日本語              |
ﾆﾎﾝｺﾞ               |
nihongo             |
niﾎﾝ語              |
       日本語       |
       ﾆﾎﾝｺﾞ        |
      nihongo       |
       niﾎﾝ語       |
              日本語|
               ﾆﾎﾝｺﾞ|
             nihongo|
              niﾎﾝ語|

実行環境 Mac OSX 10.5.6 Leopard