最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

utf 8 - Convert cp1252 to unicode in javascript - Stack Overflow

programmeradmin0浏览0评论

I need to convert cp125*2* text to unicode utf in javascript function.

Function to convert CP125*1* to utf I already find.

Please help me if you have this functionality, thanks!

I need to convert cp125*2* text to unicode utf in javascript function.

Function to convert CP125*1* to utf I already find.

Please help me if you have this functionality, thanks!

Share Improve this question asked Nov 9, 2010 at 1:51 DanielDaniel 211 silver badge3 bronze badges 2
  • Javascript strings are Unicode strings. Where are you getting that cp1252 encoded text from and in what format? a byte array? – Alex Jasmin Commented Nov 9, 2010 at 1:59
  • So, I get the raw page html just after it loaded (in my firefox addon), run the javascript function that parse this page by regexp. And to do it correctly I previously need to convert all characters to utf. I can do it correctly on the server side, but not in this task. – Daniel Commented Nov 9, 2010 at 2:06
Add a ment  | 

1 Answer 1

Reset to default 8

If ISO-8859-1 is close enough, there is a special shortcut to convert ISO-8859-1-bytes-in-code-units to Unicode characters, due to the simple byte=code-point mapping:

var chars= decodeURIComponent(escape(bytes));

For any other encoding there is no built-in functionality; you would have to include your own lookup tables. For example:

var encodings= {
    // Windows code page 1252 Western European
    //
    cp1252: '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u20ac\ufffd\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\ufffd\u017d\ufffd\ufffd\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\ufffd\u017e\u0178\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf\xb0\xb1\xb2\xb3\xb4\xb5\xb6\xb7\xb8\xb9\xba\xbb\xbc\xbd\xbe\xbf\xc0\xc1\xc2\xc3\xc4\xc5\xc6\xc7\xc8\xc9\xca\xcb\xcc\xcd\xce\xcf\xd0\xd1\xd2\xd3\xd4\xd5\xd6\xd7\xd8\xd9\xda\xdb\xdc\xdd\xde\xdf\xe0\xe1\xe2\xe3\xe4\xe5\xe6\xe7\xe8\xe9\xea\xeb\xec\xed\xee\xef\xf0\xf1\xf2\xf3\xf4\xf5\xf6\xf7\xf8\xf9\xfa\xfb\xfc\xfd\xfe\xff',

    // Windows code page 1251 Cyrillic
    //
    cp1251: '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u0402\u0403\u201a\u0453\u201e\u2026\u2020\u2021\u20ac\u2030\u0409\u2039\u040a\u040c\u040b\u040f\u0452\u2018\u2019\u201c\u201d\u2022\u2013\u2014\ufffd\u2122\u0459\u203a\u045a\u045c\u045b\u045f\xa0\u040e\u045e\u0408\xa4\u0490\xa6\xa7\u0401\xa9\u0404\xab\xac\xad\xae\u0407\xb0\xb1\u0406\u0456\u0491\xb5\xb6\xb7\u0451\u2116\u0454\xbb\u0458\u0405\u0455\u0457\u0410\u0411\u0412\u0413\u0414\u0415\u0416\u0417\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u0420\u0421\u0422\u0423\u0424\u0425\u0426\u0427\u0428\u0429\u042a\u042b\u042c\u042d\u042e\u042f\u0430\u0431\u0432\u0433\u0434\u0435\u0436\u0437\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u0440\u0441\u0442\u0443\u0444\u0445\u0446\u0447\u0448\u0449\u044a\u044b\u044c\u044d\u044e\u044f'
};

function decodeBytes(bytes, encoding) {
    var enc= encodings[encoding];
    var n= bytes.length;
    var chars= new Array(n);
    for (var i= 0; i<n; i++)
        chars[i]= enc.charAt(bytes.charCodeAt(i));
    return chars.join('');
}

alert(decodeBytes('\xc7\xe4\xf0\xe0\xe2\xf1\xf2\xe2\xf3\xe9 \xec\xe8\xf0', 'cp1251'));
// '\u0417\u0434\u0440\u0430\u0432\u0441\u0442\u0432\u0443\u0439 \u043c\u0438\u0440'
// Здравствуй мир

ETA:

So, I get the raw page html just after it loaded (in my firefox addon), run the javascript function that parse this page by regexp.

Yeah, don't do that. You can't parse HTML with regex.

Why not let Firefox take care of parsing the page for its given charset?

ETA(2):

'koi8-r': '\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\u2500\u2502\u250c\u2510\u2514\u2518\u251c\u2524\u252c\u2534\u253c\u2580\u2584\u2588\u258c\u2590\u2591\u2592\u2593\u2320\u25a0\u2219\u221a\u2248\u2264\u2265\xa0\u2321\xb0\xb2\xb7\xf7\u2550\u2551\u2552\u0451\u2553\u2554\u2555\u2556\u2557\u2558\u2559\u255a\u255b\u255c\u255d\u255e\u255f\u2560\u2561\u0401\u2562\u2563\u2564\u2565\u2566\u2567\u2568\u2569\u256a\u256b\u256c\xa9\u044e\u0430\u0431\u0446\u0434\u0435\u0444\u0433\u0445\u0438\u0439\u043a\u043b\u043c\u043d\u043e\u043f\u044f\u0440\u0441\u0442\u0443\u0436\u0432\u044c\u044b\u0437\u0448\u044d\u0449\u0447\u044a\u042e\u0410\u0411\u0426\u0414\u0415\u0424\u0413\u0425\u0418\u0419\u041a\u041b\u041c\u041d\u041e\u041f\u042f\u0420\u0421\u0422\u0423\u0416\u0412\u042c\u042b\u0417\u0428\u042d\u0429\u0427\u042a'

(You can grab the mappings for single-byte encodings out of Python, saying something like:)

>>> ''.join(map(chr, range(256))).decode('koi8-r', 'replace')

I don't know how you're going about reading the input stream, but you shouldn't normally need to be doing this kind of encoding unmangling manually.

发布评论

评论列表(0)

  1. 暂无评论