EvoTalk

19 十一月, 2007

UTF-8 Encoding and Decoding

Posted by: asd In: C++| Code Snippet| 程式設計 ()

參考

改成 c 版本

C++:
  1. #include <stdio.h>
  2. #include <string.h>
  3.  
  4. void EncodeToUTF8(char * szSource, char *szFinal);
  5. void DecodeFromUTF8(char * szSource, char *szFinal);
  6.  
  /*
   * Demo driver: encodes a sample string with EncodeToUTF8, prints the
   * escaped form, decodes it again with DecodeFromUTF8 and prints the result.
   */
  int main(int argc, char* argv[])
  {
    /* Mutable copy: EncodeToUTF8 takes a non-const char *, so a string
       literal must not be passed to it directly. */
    char szSample[] = "123abc測試";
    char szEncodeFinal[256];
    char szDecodeFinal[256];

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    EncodeToUTF8(szSample, szEncodeFinal);
    printf("Encode:%s\n", szEncodeFinal);

    DecodeFromUTF8(szEncodeFinal, szDecodeFinal);
    printf("Decode:%s\n", szDecodeFinal);

    return 0;
  }
  17.  
  /**
   * Encode a NUL-terminated byte string as "=XX"-escaped UTF-8 text.
   *
   * Every byte of szSource is treated as one code point in the range 0..255:
   *   - '=' is written as "=3D" so it cannot be mistaken for an escape,
   *   - any other byte below 128 is copied unchanged,
   *   - a byte in 128..255 becomes a two-byte UTF-8 sequence, each byte
   *     written as '=' followed by two upper-case hexadecimal digits.
   *
   * Bytes are read as unsigned char so the output does not depend on whether
   * plain char is signed on the target platform.
   *
   * szSource  Input string; NULL is treated as an empty string.
   * szFinal   Output buffer; must hold at least 6 * strlen(szSource) + 1
   *           bytes. Nothing is written when it is NULL.
   */
  void EncodeToUTF8(char * szSource, char *szFinal)
  {
    static const char kHexDigits[] = "0123456789ABCDEF";
    const unsigned char *in;
    char *out = szFinal;

    if (szFinal == NULL) {
      return;
    }

    if (szSource != NULL) {
      for (in = (const unsigned char *)szSource; *in != '\0'; ++in) {
        unsigned char escaped[2];
        int count;
        int i;

        if (*in == '=') {
          /* The escape introducer itself must be escaped. */
          escaped[0] = (unsigned char)'=';
          count = 1;
        } else if (*in < 128) {
          /* Plain 7-bit character: copied through untouched. */
          *out++ = (char)*in;
          continue;
        } else {
          /* 128..255: lead byte 110xxxxx followed by 10xxxxxx. */
          escaped[0] = (unsigned char)(192 + (*in / 64));
          escaped[1] = (unsigned char)(128 + (*in % 64));
          count = 2;
        }

        /* Append through a cursor instead of strcat so the whole
           conversion stays linear in the input length. */
        for (i = 0; i < count; ++i) {
          *out++ = '=';
          *out++ = kHexDigits[escaped[i] >> 4];
          *out++ = kHexDigits[escaped[i] & 0x0F];
        }
      }
    }

    *out = '\0';
  }
  120.  
  unsigned char MakeByte(char ch1, char ch2);

  /**
   * Decode "=XX"-escaped UTF-8 text (as produced by EncodeToUTF8) back into
   * a byte string.
   *
   * Characters other than '=' are copied unchanged. An '=' introduces an
   * escaped UTF-8 sequence of one to six bytes, each written as '=' plus two
   * hexadecimal digits; the lead byte selects the sequence length. Each
   * sequence yields a single output char, so only the low eight bits of the
   * decoded code point are kept.
   *
   * Malformed input is handled leniently:
   *   - a sequence cut short by the end of the input stops decoding,
   *   - an '=' followed by a byte that is not a valid lead byte is dropped
   *     and the characters after it are treated as ordinary text,
   *   - a sequence that decodes to NUL contributes nothing to the output.
   * The '=' separators in front of continuation bytes are not verified.
   *
   * szSource  Input string; NULL is treated as an empty string.
   * szFinal   Output buffer; must hold at least strlen(szSource) + 1 bytes.
   *           Nothing is written when it is NULL.
   */
  void DecodeFromUTF8(char * szSource, char *szFinal)
  {
    size_t len;
    size_t pos = 0;
    char *out = szFinal;

    if (szFinal == NULL) {
      return;
    }

    len = (szSource != NULL) ? strlen(szSource) : 0;

    while (pos < len) {
      unsigned char lead;
      unsigned char decoded;
      size_t count;

      if (szSource[pos] != '=') {
        *out++ = szSource[pos++];
        continue;
      }

      /* An escape needs '=' plus two hex digits. */
      if (len - pos < 3) {
        break;
      }
      lead = MakeByte(szSource[pos + 1], szSource[pos + 2]);

      /* The lead byte tells how many escaped bytes form the sequence. */
      if (lead < 128) {
        count = 1;
      } else if (lead >= 192 && lead <= 223) {
        count = 2;
      } else if (lead >= 224 && lead <= 239) {
        count = 3;
      } else if (lead >= 240 && lead <= 247) {
        count = 4;
      } else if (lead >= 248 && lead <= 251) {
        count = 5;
      } else if (lead >= 252 && lead <= 253) {
        count = 6;
      } else {
        /* Not a valid lead byte: drop the '=' and carry on. */
        ++pos;
        continue;
      }

      /* Every escaped byte occupies three characters. */
      if (len - pos < 3 * count) {
        break;
      }

      if (count == 1) {
        decoded = lead;
      } else {
        /* Only the last two bytes of the sequence can influence the low
           eight bits of the code point. Unsigned arithmetic keeps the
           reduction modulo 256 well defined for any input. */
        size_t penultAt = pos + 3 * (count - 2);
        size_t lastAt = pos + 3 * (count - 1);
        unsigned penult = MakeByte(szSource[penultAt + 1], szSource[penultAt + 2]);
        unsigned last = MakeByte(szSource[lastAt + 1], szSource[lastAt + 2]);

        decoded = (unsigned char)((penult << 6) + last - 128u);
      }

      /* A NUL would terminate the output early, so it is not stored. */
      if (decoded != 0) {
        *out++ = (char)decoded;
      }
      pos += 3 * count;
    }

    *out = '\0';
  }
  237.  
  /* Value of a single hexadecimal digit (either case); 0 for any other
     character. */
  static unsigned char HexDigitValue(char ch)
  {
    if (ch >= '0' && ch <= '9') {
      return (unsigned char)(ch - '0');
    }
    if (ch >= 'A' && ch <= 'F') {
      return (unsigned char)(ch - 'A' + 10);
    }
    if (ch >= 'a' && ch <= 'f') {
      return (unsigned char)(ch - 'a' + 10);
    }
    return 0;
  }

  /**
   * Helper for decoding: combine two hexadecimal digit characters into the
   * byte they spell.
   *
   * ch1  High-order digit ('0'-'9', 'A'-'F' or 'a'-'f').
   * ch2  Low-order digit (same alphabet).
   *
   * Returns (value of ch1) * 16 + (value of ch2). A character that is not a
   * hexadecimal digit contributes 0 to the result.
   */
  unsigned char MakeByte(char ch1, char ch2)
  {
    unsigned char high = HexDigitValue(ch1);
    unsigned char low = HexDigitValue(ch2);

    return (unsigned char)((high << 4) | low);
  }

[1]: http://www.codeproject.com/string/UTF8.asp
[2]: http://www1.tip.nl/~t876506/utf8tbl.html

Tags:

Related Posts



No Responses to "UTF-8 Encoding and Decoding"

Comment Form


  • BK: 大於和小於在今日更廣泛地使用於標籤上,故在此補充該英文用法: <>: angle bracket; []: square bracket
  • luh1688: 非常實用且謝謝!~
  • asd: 好的,不過很久沒修改了,不知道能不能動 寄到您的yahoo信箱
  • LIANG: nice post, thank you
  • Justmaker: 您好,請問可以跟你要source嗎?我最近有在看股票,想要enhance您的小工具,不知是否可以開放?

Category