| 1 |
using System; |
| 2 |
using System.Collections.Generic; |
| 3 |
using System.Linq; |
| 4 |
using System.Text; |
| 5 |
using System.IO; |
| 6 |
|
| 7 |
namespace SchoolIdolFestivalSimulator.Components {
    /// <summary>
    /// Heuristic detection of the character encoding of a file or byte buffer,
    /// aimed at Japanese text (ISO-2022-JP, EUC-JP, Shift_JIS, UTF-8).
    /// </summary>
    public class FileEncoding {
        // Windows code page identifiers handed to Encoding.GetEncoding(int).
        private const int CodePageJis = 50220;      // ISO-2022-JP ("JIS")
        private const int CodePageEucJp = 51932;    // EUC-JP
        private const int CodePageShiftJis = 932;   // Shift_JIS

        /// <summary>
        /// Reads the whole file and guesses its character encoding.
        /// </summary>
        /// <param name="fileName">Path of the file to inspect.</param>
        /// <returns>
        /// The result of <see cref="GetEncoding(byte[])"/> applied to the complete
        /// file content; <c>null</c> when no encoding could be determined.
        /// </returns>
        public static Encoding GetFileEncoding(string fileName) {
            // File.ReadAllBytes returns every byte of the file (including the last
            // one) and yields an empty array for an empty file, so the detector
            // always sees the complete content.
            return GetEncoding(File.ReadAllBytes(fileName));
        }

        /// <summary>
        /// Guesses the character encoding of a byte sequence.
        /// </summary>
        /// <remarks>
        /// This is a port of the getcode method of Jcode.pm.
        /// Jcode.pm (http://openlab.ring.gr.jp/Jcode/index-j.html)
        /// Jcode.pm Copyright: Copyright 1999-2005 Dan Kogai
        /// The Encode::is_utf8 check of the original is not ported.
        /// NOTE(review): on .NET Core / .NET 5+ the code pages 50220, 51932 and 932
        /// are presumably only available once a code-pages encoding provider has
        /// been registered by the application - confirm for the target runtime.
        /// </remarks>
        /// <param name="bytes">The data whose encoding is to be determined.</param>
        /// <returns>
        /// The <see cref="System.Text.Encoding"/> that appears to fit best, or
        /// <c>null</c> when the data looks binary or no candidate clearly wins.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null.</exception>
        public static System.Text.Encoding GetEncoding(byte[] bytes) {
            if (bytes == null) {
                throw new ArgumentNullException("bytes");
            }

            // 1. Control bytes mark the data as binary, unless a NUL followed by a
            //    7-bit byte is found, which smells like raw UTF-16.
            bool looksBinary = false;
            for (int i = 0; i < bytes.Length; i++) {
                byte current = bytes[i];
                if (current <= 0x06 || current == 0x7F || current == 0xFF) {
                    looksBinary = true;
                    if (current == 0x00 && i < bytes.Length - 1 && bytes[i + 1] <= 0x7F) {
                        return System.Text.Encoding.Unicode;
                    }
                }
            }
            if (looksBinary) {
                return null;
            }

            // 2. No ESC and no 8-bit byte at all: not Japanese, plain ASCII.
            if (IsSevenBitWithoutEscape(bytes)) {
                return System.Text.Encoding.ASCII;
            }

            // 3. Any ISO-2022-JP escape sequence settles it as JIS.
            if (ContainsJisEscapeSequence(bytes)) {
                return System.Text.Encoding.GetEncoding(CodePageJis);
            }

            // 4. Should be euc|sjis|utf8: score each candidate by the number of bytes
            //    that form valid multi-byte sequences in it.
            //    (use of (?:) in the original patterns by Hiroki Ohzaki <ohzaki@iod.ricoh.co.jp>)
            int sjis = CountShiftJisBytes(bytes);
            int euc = CountEucJpBytes(bytes);
            int utf8 = CountUtf8Bytes(bytes);
            // M. Takahashi's suggestion in Jcode.pm (utf8 += utf8 / 2) is intentionally
            // not applied.

            System.Diagnostics.Debug.WriteLine(
                string.Format("sjis = {0}, euc = {1}, utf8 = {2}", sjis, euc, utf8));

            // A candidate wins only when it strictly beats both others; ties give null.
            if (euc > sjis && euc > utf8) {
                return System.Text.Encoding.GetEncoding(CodePageEucJp);
            }
            if (sjis > euc && sjis > utf8) {
                return System.Text.Encoding.GetEncoding(CodePageShiftJis);
            }
            if (utf8 > euc && utf8 > sjis) {
                return System.Text.Encoding.UTF8;
            }
            return null;
        }

        /// <summary>True when no byte is ESC (0x1B) and no byte is 0x80 or above.</summary>
        private static bool IsSevenBitWithoutEscape(byte[] bytes) {
            const byte Escape = 0x1B;
            foreach (byte current in bytes) {
                if (current == Escape || current >= 0x80) {
                    return false;
                }
            }
            return true;
        }

        /// <summary>True when the data contains one of the ISO-2022-JP escape sequences.</summary>
        private static bool ContainsJisEscapeSequence(byte[] bytes) {
            const byte Escape = 0x1B;
            const byte At = 0x40;      // '@'
            const byte Dollar = 0x24;  // '$'
            const byte And = 0x26;     // '&'
            const byte Open = 0x28;    // '('
            const byte B = 0x42;
            const byte D = 0x44;
            const byte J = 0x4A;
            const byte I = 0x49;

            int len = bytes.Length;
            for (int i = 0; i < len - 2; i++) {
                if (bytes[i] != Escape) {
                    continue;
                }
                byte second = bytes[i + 1];
                byte third = bytes[i + 2];

                // ESC $ @ (JIS_0208 1978) / ESC $ B (JIS_0208 1983)
                if (second == Dollar && (third == At || third == B)) {
                    return true;
                }
                // ESC ( B, ESC ( J (JIS_ASC) / ESC ( I (JIS_KANA)
                if (second == Open && (third == B || third == J || third == I)) {
                    return true;
                }
                if (i < len - 3) {
                    byte fourth = bytes[i + 3];
                    // ESC $ ( D (JIS_0212)
                    if (second == Dollar && third == Open && fourth == D) {
                        return true;
                    }
                    // ESC & @ ESC $ B (JIS_0208 1990)
                    if (i < len - 5 &&
                        second == And && third == At && fourth == Escape &&
                        bytes[i + 4] == Dollar && bytes[i + 5] == B) {
                        return true;
                    }
                }
            }
            return false;
        }

        /// <summary>Counts the bytes that pair up as Shift_JIS double-byte characters.</summary>
        private static int CountShiftJisBytes(byte[] bytes) {
            int score = 0;
            for (int i = 0; i < bytes.Length - 1; i++) {
                byte lead = bytes[i];
                byte trail = bytes[i + 1];
                bool leadOk = (0x81 <= lead && lead <= 0x9F) || (0xE0 <= lead && lead <= 0xFC);
                bool trailOk = (0x40 <= trail && trail <= 0x7E) || (0x80 <= trail && trail <= 0xFC);
                if (leadOk && trailOk) {
                    // SJIS_C
                    score += 2;
                    i++; // skip the trail byte that was just consumed
                }
            }
            return score;
        }

        /// <summary>Counts the bytes that form EUC-JP two- or three-byte sequences.</summary>
        private static int CountEucJpBytes(byte[] bytes) {
            int len = bytes.Length;
            int score = 0;
            for (int i = 0; i < len - 1; i++) {
                byte first = bytes[i];
                byte second = bytes[i + 1];
                bool secondHigh = 0xA1 <= second && second <= 0xFE;
                if (((0xA1 <= first && first <= 0xFE) && secondHigh) ||
                    (first == 0x8E && (0xA1 <= second && second <= 0xDF))) {
                    // EUC_C, EUC_KANA
                    score += 2;
                    i++;
                } else if (i < len - 2) {
                    byte third = bytes[i + 2];
                    if (first == 0x8F && secondHigh && (0xA1 <= third && third <= 0xFE)) {
                        // EUC_0212
                        score += 3;
                        i += 2;
                    }
                }
            }
            return score;
        }

        /// <summary>Counts the bytes that form two- or three-byte UTF-8 sequences.</summary>
        private static int CountUtf8Bytes(byte[] bytes) {
            int len = bytes.Length;
            int score = 0;
            for (int i = 0; i < len - 1; i++) {
                byte first = bytes[i];
                byte second = bytes[i + 1];
                bool secondIsContinuation = 0x80 <= second && second <= 0xBF;
                if ((0xC0 <= first && first <= 0xDF) && secondIsContinuation) {
                    // two-byte UTF8 sequence
                    score += 2;
                    i++;
                } else if (i < len - 2) {
                    byte third = bytes[i + 2];
                    if ((0xE0 <= first && first <= 0xEF) && secondIsContinuation &&
                        (0x80 <= third && third <= 0xBF)) {
                        // three-byte UTF8 sequence
                        score += 3;
                        i += 2;
                    }
                }
            }
            return score;
        }
    }
}