SSJB's blog

いろいろです。

康煕部首をPerlで変換する

Webサイト制作時に、いつの間にか康煕部首が紛れ込んでいることがあります。
Illustratorからテキストをコピペで持ってくると、たまに一部の漢字が康煕部首になってしまうことがあり、明確な原因と対策がわからないので、一気に変換してしまうことにしました。
楽な方法がわからなかったので、Perlでベタ書きです。
(一緒にNFC/NFD対策も入れています)

use strict;
use warnings;
use feature qw/say/;
use Encode;
use Encode::UTF8Mac;

# Conversion table for the Unicode "Kangxi Radicals" block (U+2F00..U+2FD5).
#
# Each outer key is the code point label of a radical; its value is a
# one-pair hash mapping the radical character itself to the ordinary CJK
# ideograph that should replace it.
#
# Where the Japanese glyph differs from the Unicode compatibility
# decomposition, the table uses the Japanese form (e.g. 戸, 黄, 黒).
# U+2FAD (KANGXI RADICAL BLUE) therefore maps to 青; it previously mapped
# to the unrelated character 菁, which corrupted converted text.
my $kangxi_radicals = {
    'U+2F00' => {'⼀' => '一'},
    'U+2F01' => {'⼁' => '丨'},
    'U+2F02' => {'⼂' => '丶'},
    'U+2F03' => {'⼃' => '丿'},
    'U+2F04' => {'⼄' => '乙'},
    'U+2F05' => {'⼅' => '亅'},
    'U+2F06' => {'⼆' => '二'},
    'U+2F07' => {'⼇' => '亠'},
    'U+2F08' => {'⼈' => '人'},
    'U+2F09' => {'⼉' => '儿'},
    'U+2F0A' => {'⼊' => '入'},
    'U+2F0B' => {'⼋' => '八'},
    'U+2F0C' => {'⼌' => '冂'},
    'U+2F0D' => {'⼍' => '冖'},
    'U+2F0E' => {'⼎' => '冫'},
    'U+2F0F' => {'⼏' => '几'},
    'U+2F10' => {'⼐' => '凵'},
    'U+2F11' => {'⼑' => '刀'},
    'U+2F12' => {'⼒' => '力'},
    'U+2F13' => {'⼓' => '勹'},
    'U+2F14' => {'⼔' => '匕'},
    'U+2F15' => {'⼕' => '匚'},
    'U+2F16' => {'⼖' => '匸'},
    'U+2F17' => {'⼗' => '十'},
    'U+2F18' => {'⼘' => '卜'},
    'U+2F19' => {'⼙' => '卩'},
    'U+2F1A' => {'⼚' => '厂'},
    'U+2F1B' => {'⼛' => '厶'},
    'U+2F1C' => {'⼜' => '又'},
    'U+2F1D' => {'⼝' => '口'},
    'U+2F1E' => {'⼞' => '囗'},
    'U+2F1F' => {'⼟' => '土'},
    'U+2F20' => {'⼠' => '士'},
    'U+2F21' => {'⼡' => '夂'},
    'U+2F22' => {'⼢' => '夊'},
    'U+2F23' => {'⼣' => '夕'},
    'U+2F24' => {'⼤' => '大'},
    'U+2F25' => {'⼥' => '女'},
    'U+2F26' => {'⼦' => '子'},
    'U+2F27' => {'⼧' => '宀'},
    'U+2F28' => {'⼨' => '寸'},
    'U+2F29' => {'⼩' => '小'},
    'U+2F2A' => {'⼪' => '尢'},
    'U+2F2B' => {'⼫' => '尸'},
    'U+2F2C' => {'⼬' => '屮'},
    'U+2F2D' => {'⼭' => '山'},
    'U+2F2E' => {'⼮' => '巛'},
    'U+2F2F' => {'⼯' => '工'},
    'U+2F30' => {'⼰' => '己'},
    'U+2F31' => {'⼱' => '巾'},
    'U+2F32' => {'⼲' => '干'},
    'U+2F33' => {'⼳' => '幺'},
    'U+2F34' => {'⼴' => '广'},
    'U+2F35' => {'⼵' => '廴'},
    'U+2F36' => {'⼶' => '廾'},
    'U+2F37' => {'⼷' => '弋'},
    'U+2F38' => {'⼸' => '弓'},
    'U+2F39' => {'⼹' => '彐'},
    'U+2F3A' => {'⼺' => '彡'},
    'U+2F3B' => {'⼻' => '彳'},
    'U+2F3C' => {'⼼' => '心'},
    'U+2F3D' => {'⼽' => '戈'},
    'U+2F3E' => {'⼾' => '戸'},
    'U+2F3F' => {'⼿' => '手'},
    'U+2F40' => {'⽀' => '支'},
    'U+2F41' => {'⽁' => '攴'},
    'U+2F42' => {'⽂' => '文'},
    'U+2F43' => {'⽃' => '斗'},
    'U+2F44' => {'⽄' => '斤'},
    'U+2F45' => {'⽅' => '方'},
    'U+2F46' => {'⽆' => '无'},
    'U+2F47' => {'⽇' => '日'},
    'U+2F48' => {'⽈' => '曰'},
    'U+2F49' => {'⽉' => '月'},
    'U+2F4A' => {'⽊' => '木'},
    'U+2F4B' => {'⽋' => '欠'},
    'U+2F4C' => {'⽌' => '止'},
    'U+2F4D' => {'⽍' => '歹'},
    'U+2F4E' => {'⽎' => '殳'},
    'U+2F4F' => {'⽏' => '毋'},
    'U+2F50' => {'⽐' => '比'},
    'U+2F51' => {'⽑' => '毛'},
    'U+2F52' => {'⽒' => '氏'},
    'U+2F53' => {'⽓' => '气'},
    'U+2F54' => {'⽔' => '水'},
    'U+2F55' => {'⽕' => '火'},
    'U+2F56' => {'⽖' => '爪'},
    'U+2F57' => {'⽗' => '父'},
    'U+2F58' => {'⽘' => '爻'},
    'U+2F59' => {'⽙' => '爿'},
    'U+2F5A' => {'⽚' => '片'},
    'U+2F5B' => {'⽛' => '牙'},
    'U+2F5C' => {'⽜' => '牛'},
    'U+2F5D' => {'⽝' => '犬'},
    'U+2F5E' => {'⽞' => '玄'},
    'U+2F5F' => {'⽟' => '玉'},
    'U+2F60' => {'⽠' => '瓜'},
    'U+2F61' => {'⽡' => '瓦'},
    'U+2F62' => {'⽢' => '甘'},
    'U+2F63' => {'⽣' => '生'},
    'U+2F64' => {'⽤' => '用'},
    'U+2F65' => {'⽥' => '田'},
    'U+2F66' => {'⽦' => '疋'},
    'U+2F67' => {'⽧' => '疒'},
    'U+2F68' => {'⽨' => '癶'},
    'U+2F69' => {'⽩' => '白'},
    'U+2F6A' => {'⽪' => '皮'},
    'U+2F6B' => {'⽫' => '皿'},
    'U+2F6C' => {'⽬' => '目'},
    'U+2F6D' => {'⽭' => '矛'},
    'U+2F6E' => {'⽮' => '矢'},
    'U+2F6F' => {'⽯' => '石'},
    'U+2F70' => {'⽰' => '示'},
    'U+2F71' => {'⽱' => '禸'},
    'U+2F72' => {'⽲' => '禾'},
    'U+2F73' => {'⽳' => '穴'},
    'U+2F74' => {'⽴' => '立'},
    'U+2F75' => {'⽵' => '竹'},
    'U+2F76' => {'⽶' => '米'},
    'U+2F77' => {'⽷' => '糸'},
    'U+2F78' => {'⽸' => '缶'},
    'U+2F79' => {'⽹' => '网'},
    'U+2F7A' => {'⽺' => '羊'},
    'U+2F7B' => {'⽻' => '羽'},
    'U+2F7C' => {'⽼' => '老'},
    'U+2F7D' => {'⽽' => '而'},
    'U+2F7E' => {'⽾' => '耒'},
    'U+2F7F' => {'⽿' => '耳'},
    'U+2F80' => {'⾀' => '聿'},
    'U+2F81' => {'⾁' => '肉'},
    'U+2F82' => {'⾂' => '臣'},
    'U+2F83' => {'⾃' => '自'},
    'U+2F84' => {'⾄' => '至'},
    'U+2F85' => {'⾅' => '臼'},
    'U+2F86' => {'⾆' => '舌'},
    'U+2F87' => {'⾇' => '舛'},
    'U+2F88' => {'⾈' => '舟'},
    'U+2F89' => {'⾉' => '艮'},
    'U+2F8A' => {'⾊' => '色'},
    'U+2F8B' => {'⾋' => '艸'},
    'U+2F8C' => {'⾌' => '虍'},
    'U+2F8D' => {'⾍' => '虫'},
    'U+2F8E' => {'⾎' => '血'},
    'U+2F8F' => {'⾏' => '行'},
    'U+2F90' => {'⾐' => '衣'},
    'U+2F91' => {'⾑' => '襾'},
    'U+2F92' => {'⾒' => '見'},
    'U+2F93' => {'⾓' => '角'},
    'U+2F94' => {'⾔' => '言'},
    'U+2F95' => {'⾕' => '谷'},
    'U+2F96' => {'⾖' => '豆'},
    'U+2F97' => {'⾗' => '豕'},
    'U+2F98' => {'⾘' => '豸'},
    'U+2F99' => {'⾙' => '貝'},
    'U+2F9A' => {'⾚' => '赤'},
    'U+2F9B' => {'⾛' => '走'},
    'U+2F9C' => {'⾜' => '足'},
    'U+2F9D' => {'⾝' => '身'},
    'U+2F9E' => {'⾞' => '車'},
    'U+2F9F' => {'⾟' => '辛'},
    'U+2FA0' => {'⾠' => '辰'},
    'U+2FA1' => {'⾡' => '辵'},
    'U+2FA2' => {'⾢' => '邑'},
    'U+2FA3' => {'⾣' => '酉'},
    'U+2FA4' => {'⾤' => '釆'},
    'U+2FA5' => {'⾥' => '里'},
    'U+2FA6' => {'⾦' => '金'},
    'U+2FA7' => {'⾧' => '長'},
    'U+2FA8' => {'⾨' => '門'},
    'U+2FA9' => {'⾩' => '阜'},
    'U+2FAA' => {'⾪' => '隶'},
    'U+2FAB' => {'⾫' => '隹'},
    'U+2FAC' => {'⾬' => '雨'},
    'U+2FAD' => {'⾭' => '青'},
    'U+2FAE' => {'⾮' => '非'},
    'U+2FAF' => {'⾯' => '面'},
    'U+2FB0' => {'⾰' => '革'},
    'U+2FB1' => {'⾱' => '韋'},
    'U+2FB2' => {'⾲' => '韭'},
    'U+2FB3' => {'⾳' => '音'},
    'U+2FB4' => {'⾴' => '頁'},
    'U+2FB5' => {'⾵' => '風'},
    'U+2FB6' => {'⾶' => '飛'},
    'U+2FB7' => {'⾷' => '食'},
    'U+2FB8' => {'⾸' => '首'},
    'U+2FB9' => {'⾹' => '香'},
    'U+2FBA' => {'⾺' => '馬'},
    'U+2FBB' => {'⾻' => '骨'},
    'U+2FBC' => {'⾼' => '高'},
    'U+2FBD' => {'⾽' => '髟'},
    'U+2FBE' => {'⾾' => '鬥'},
    'U+2FBF' => {'⾿' => '鬯'},
    'U+2FC0' => {'⿀' => '鬲'},
    'U+2FC1' => {'⿁' => '鬼'},
    'U+2FC2' => {'⿂' => '魚'},
    'U+2FC3' => {'⿃' => '鳥'},
    'U+2FC4' => {'⿄' => '鹵'},
    'U+2FC5' => {'⿅' => '鹿'},
    'U+2FC6' => {'⿆' => '麥'},
    'U+2FC7' => {'⿇' => '麻'},
    'U+2FC8' => {'⿈' => '黄'},
    'U+2FC9' => {'⿉' => '黍'},
    'U+2FCA' => {'⿊' => '黒'},
    'U+2FCB' => {'⿋' => '黹'},
    'U+2FCC' => {'⿌' => '黽'},
    'U+2FCD' => {'⿍' => '鼎'},
    'U+2FCE' => {'⿎' => '鼓'},
    'U+2FCF' => {'⿏' => '鼠'},
    'U+2FD0' => {'⿐' => '鼻'},
    'U+2FD1' => {'⿑' => '齊'},
    'U+2FD2' => {'⿒' => '齒'},
    'U+2FD3' => {'⿓' => '龍'},
    'U+2FD4' => {'⿔' => '龜'},
    'U+2FD5' => {'⿕' => '龠'}
};

# Usage: perl SCRIPT INPUT_FILE OUTPUT_FILE
#
# Copies INPUT_FILE to OUTPUT_FILE line by line, re-encoding each line from
# 'utf-8-mac' to 'utf-8' and replacing every Kangxi radical listed in
# $kangxi_radicals with its mapped ideograph. Trailing CR/LF characters are
# stripped from each line and a single "\n" is written back by say().
#
# Dies if the arguments are missing or if either file cannot be opened
# or closed.
my ($file_name, $output) = @ARGV;
die "Usage: $0 INPUT_FILE OUTPUT_FILE\n"
    unless defined $file_name && defined $output;

# Flatten the nested table into a single radical => ideograph lookup and
# build one alternation from its keys, once, instead of re-sorting the table
# and running a separate substitution per radical on every line.
my %ideograph_for = map { %{$_} } values %{$kangxi_radicals};
my $radical_alternation
    = join '|', map { quotemeta($_) } sort keys %ideograph_for;
my $radical_re = qr/($radical_alternation)/;

open(my $in,  '<', $file_name) or die "Can't open $file_name: $!";
open(my $out, '>', $output)    or die "Can't open $output: $!";

while (my $row = <$in>) {

    # Drop the line terminator (LF, CRLF, or any trailing run of CR/LF).
    $row =~ s/[\r\n]+$//;

    # Decode via the 'utf-8-mac' encoding and re-encode as plain UTF-8
    # (the NFC/NFD handling mentioned in the article). The result is a byte
    # string, matching the byte-string keys of the table (the script does
    # not enable `use utf8`).
    $row = Encode::encode('utf-8', Encode::decode('utf-8-mac', $row));

    # A single pass is equivalent to the sequential per-radical passes
    # because no replacement value is itself a Kangxi radical.
    $row =~ s/$radical_re/$ideograph_for{$1}/g;

    say {$out} $row;
}

close($in) or die "Can't close $file_name: $!";

# Buffered write errors only surface at close, so check it.
close($out) or die "Can't close $output: $!";