最近学习了下编码
通过以下地址可以很好地学习到相关的知识
http://dev.csdn.net/develop/article/69/69883.shtm
http://dev.csdn.net/develop/article/72/72888.shtm
其中讲了UTF8的编码
当要表示的内容是7位的时候就用一个字节:0******* 第一个0为标志位,剩下的空间正好可以表示ASCII0-127的内容。
当要表示的内容在8到11位的时候就用两个字节:110***** 10****** 第一个字节的110和第二个字节的10为标志位。
当要表示的内容在12到16位的时候就用三个字节:1110**** 10****** 10****** 和上面一样,第一个字节的1110和第二、三个字节的10都是标志位,剩下的空间正好可以表示汉字。
以此类推:
四个字节:11110*** 10****** 10****** 10******
五个字节:111110** 10****** 10****** 10****** 10******
六个字节:1111110* 10****** 10****** 10****** 10****** 10******
(注:按照 RFC 3629,现行的 UTF-8 最多只使用四个字节,五字节和六字节的形式已不再使用。)
.............................................
我自己写了转换的代码如下
/* Converting between UCS and UTF-8 */

/*
 * Encode a 0-terminated wide string as UTF-8.
 *
 * pUCS   source wide string, terminated by a 0 code unit.
 * pUTF8  destination buffer. No size is passed in, so the caller must
 *        provide room for the worst case: 3 bytes per code unit when
 *        wchar_t is 16 bits wide, 4 bytes per code unit otherwise.
 *
 * Returns the number of bytes written to pUTF8, or -1 if either pointer
 * is NULL. No terminating NUL byte is appended; use the returned length.
 *
 * A high surrogate followed by a low surrogate is combined into one code
 * point and written as a single 4-byte sequence. An unpaired surrogate is
 * written as a 3-byte sequence. Values above U+10FFFF are replaced by
 * U+FFFD.
 */
int UCS2UTF8(wchar_t* pUCS,char* pUTF8)
{
    unsigned char *out;
    int i;

    /* Validate before touching pUCS: scanning a NULL string would crash. */
    if (pUCS == NULL || pUTF8 == NULL)
        return -1;

    out = (unsigned char *)pUTF8;
    for (i = 0; pUCS[i] != 0; i++) {
        unsigned long cp = (unsigned long)pUCS[i];

        /*
         * pUCS[i] is non-zero here, so pUCS[i + 1] is at worst the
         * terminator and is always safe to read.
         */
        if (cp >= 0xD800UL && cp <= 0xDBFFUL) {
            unsigned long low = (unsigned long)pUCS[i + 1];

            if (low >= 0xDC00UL && low <= 0xDFFFUL) {
                cp = 0x10000UL + ((cp - 0xD800UL) << 10) + (low - 0xDC00UL);
                i++;
            }
        }

        if (cp > 0x10FFFFUL)
            cp = 0xFFFDUL; /* not a Unicode code point */

        if (cp <= 0x7FUL) {
            /* 1 byte: 0xxxxxxx */
            *out++ = (unsigned char)cp;
        } else if (cp <= 0x7FFUL) {
            /* 2 bytes: 110xxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xC0 | (cp >> 6));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else if (cp <= 0xFFFFUL) {
            /* 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xE0 | (cp >> 12));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        } else {
            /* 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
            *out++ = (unsigned char)(0xF0 | (cp >> 18));
            *out++ = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
            *out++ = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
            *out++ = (unsigned char)(0x80 | (cp & 0x3F));
        }
    }

    return (int)(out - (unsigned char *)pUTF8);
}
/*
 * Decode a NUL-terminated UTF-8 string into wide characters.
 *
 * pUTF8  source string, terminated by a NUL byte.
 * pUCS   destination buffer. No size is passed in, so the caller must
 *        provide room for one wchar_t per input byte (the worst case).
 *
 * Returns the number of wchar_t units written to pUCS, or -1 if either
 * pointer is NULL. No terminating 0 unit is appended; use the returned
 * length.
 *
 * Malformed input never aborts the conversion: an invalid lead byte, a
 * stray continuation byte or a truncated sequence yields U+FFFD and
 * decoding resumes at the next byte; an overlong or out-of-range sequence
 * yields a single U+FFFD. Code points above U+FFFF are stored directly
 * when wchar_t is wider than 16 bits and as a surrogate pair otherwise.
 * Encoded surrogate code points (U+D800..U+DFFF) are passed through.
 */
int UTF82UCS(char *pUTF8, wchar_t *pUCS)
{
    const unsigned char *src;
    int count = 0;

    if (pUCS == NULL || pUTF8 == NULL)
        return -1;

    /*
     * Read bytes as unsigned char: through a signed char every byte
     * >= 0x80 is negative and the range tests below could never match.
     */
    src = (const unsigned char *)pUTF8;

    while (*src != 0) {
        unsigned char lead = *src;
        unsigned long cp = 0xFFFDUL;
        unsigned long min_cp = 0;   /* smallest value this length may encode */
        int trail = -1;             /* continuation bytes expected; -1 = invalid */
        int k;

        if (lead <= 0x7F) {                         /* 0xxxxxxx */
            cp = lead;
            trail = 0;
        } else if (lead >= 0xC0 && lead <= 0xDF) {  /* 110xxxxx */
            cp = lead & 0x1FUL;
            trail = 1;
            min_cp = 0x80UL;
        } else if (lead >= 0xE0 && lead <= 0xEF) {  /* 1110xxxx */
            cp = lead & 0x0FUL;
            trail = 2;
            min_cp = 0x800UL;
        } else if (lead >= 0xF0 && lead <= 0xF7) {  /* 11110xxx */
            cp = lead & 0x07UL;
            trail = 3;
            min_cp = 0x10000UL;
        }

        /*
         * Check each continuation byte before using it. The NUL terminator
         * is not of the form 10xxxxxx, so this stops at the end of the
         * string instead of reading past it.
         */
        for (k = 1; k <= trail; k++) {
            if ((src[k] & 0xC0) != 0x80) {
                trail = -1;
                break;
            }
            cp = (cp << 6) | (src[k] & 0x3FUL);
        }

        if (trail < 0) {
            cp = 0xFFFDUL;
            src += 1;
        } else {
            if (cp < min_cp || cp > 0x10FFFFUL)
                cp = 0xFFFDUL;
            src += trail + 1;
        }

        /* Assign whole wchar_t values so byte order and width do not matter. */
        if (cp > 0xFFFFUL && sizeof(wchar_t) == 2) {
            cp -= 0x10000UL;
            pUCS[count++] = (wchar_t)(0xD800UL + (cp >> 10));
            pUCS[count++] = (wchar_t)(0xDC00UL + (cp & 0x3FFUL));
        } else {
            pUCS[count++] = (wchar_t)cp;
        }
    }

    return count;
}
其他编码和UCS转换
/*
 * Convert a NUL-terminated string in the given code page to wide chars.
 *
 * p         source string in the encoding identified by codepage.
 * pUCS      destination buffer; its size is not passed in, so the caller
 *           must make it large enough for the converted text.
 * codepage  code page identifier handed to MultiByteToWideChar.
 *
 * Returns the length reported by the sizing call, or -1 if either pointer
 * is NULL. Because the source length is given as -1, the Win32 API counts
 * the terminating null as well, so the value returned here includes it.
 */
int ToUCS(char *p,wchar_t* pUCS,int codepage)
{
    int needed;

    if (p == NULL || pUCS == NULL)
        return -1;

    /* A zero-sized output buffer makes the API report the required length only. */
    needed = MultiByteToWideChar(codepage, 0, p, -1, NULL, 0);

    /* Second pass performs the conversion into the caller's buffer. */
    MultiByteToWideChar(codepage, 0, p, -1, pUCS, needed);

    return needed;
}
/*
 * Convert a 0-terminated wide string to the given code page.
 *
 * pUCS      source wide string.
 * p         destination buffer; its size is not passed in, so the caller
 *           must make it large enough for the converted text including
 *           the terminating NUL byte.
 * codepage  code page identifier handed to WideCharToMultiByte.
 *
 * Returns the number of bytes written excluding the terminating NUL, or
 * -1 if either pointer is NULL or the sizing call reports failure.
 */
int UCSTo(wchar_t* pUCS,char *p,int codepage)
{
    int len = 0;

    /* Check the destination parameter itself (p). */
    if (pUCS == NULL || p == NULL)
        return -1;

    /* A zero-sized output buffer makes the API report the required length only. */
    len = WideCharToMultiByte(codepage, 0, pUCS, -1, NULL, 0, NULL, NULL);
    if (len <= 0)
        return -1;

    WideCharToMultiByte(codepage, 0, pUCS, -1, p, len, NULL, NULL);

    /* The source length was -1, so len counts the terminator; drop it. */
    return len - 1;
}
这里的codepage在MSDN定义如下
Bit
Code page
Description
ANSI
0
1252
Latin 1
1
1250
Latin 2: Eastern Europe
2
1251
Cyrillic
3
1253
Greek
4
1254
Turkish
5
1255
Hebrew
6
1256
Arabic
7
1257
Baltic
8
1258
VietNam
9 - 15
Reserved for ANSI
ANSI and OEM
16
874
Thai
17
932
Japanese, Shift-JIS
18
936
Chinese: Simplified chars—PRC and Singapore
19
949
Korean Unified Hangeul Code (Hangeul TongHabHyung Code)
20
950
Chinese: Traditional chars—Hong Kong SAR, PRC and Taiwan
21
1361
Korean (Johab)
22 - 29
Reserved for alternate ANSI and OEM
30 - 31
Reserved by system.
OEM
32 - 46
Reserved for OEM
47
1258
VietNam
48
869
IBM Greek
49
866
MS-DOS Russian
50
865
MS-DOS Nordic
51
864
Arabic
52
863
MS-DOS Canadian French
53
862
Hebrew
54
861
MS-DOS Icelandic
55
860
MS-DOS Portuguese
56
857
IBM Turkish
57
855
IBM Cyrillic; primarily Russian
58
852
Latin 2
59
775
Baltic
60
737
Greek; former 437 G
61
708
Arabic; ASMO 708
62
850
Western European/Latin 1
63
437
US