自动辨别文本文件是否为 UTF-8 编码的 C# 程序

王朝c#·作者佚名  2006-01-09
窄屏简体版  字體: |||超大  

/// <summary>
/// Recursively walks the directory <paramref name="Path"/> and, for every file
/// whose lower-cased extension is accepted by <c>CheckFileType</c>, checks the
/// file with <c>IsUTF8</c>. Files for which <c>IsUTF8</c> returns false are
/// read twice: once decoded as UTF-8 and once decoded with the system default
/// encoding (with byte-order-mark detection enabled).
/// </summary>
/// <param name="Path">Directory to scan; its subdirectories are scanned first.</param>
/// <remarks>
/// The two decoded texts are collected into local buffers and then dropped;
/// nothing in this method consumes them.
/// NOTE(review): presumably a conversion/reporting step is meant to use them - confirm.
/// </remarks>
private void FindNoUTFFile(string Path)
{
    DirectoryInfo Folder = new System.IO.DirectoryInfo(Path);

    // Depth-first: handle every subdirectory before the files of this one.
    foreach (DirectoryInfo subFolder in Folder.GetDirectories())
    {
        FindNoUTFFile(subFolder.FullName);
    }

    foreach (FileInfo file in Folder.GetFiles())
    {
        if (!CheckFileType(file.Extension.ToLower()))
        {
            continue;
        }

        // 'using' guarantees the handle is released even if IsUTF8 throws,
        // and before the file is reopened by the readers below.
        bool bUtf8;
        using (FileStream fs = new FileStream(file.FullName, FileMode.Open, FileAccess.Read))
        {
            bUtf8 = IsUTF8(fs);
        }

        if (bUtf8)
        {
            continue;
        }

        StringBuilder sb = new StringBuilder();   // text decoded with the default encoding
        StringBuilder sb2 = new StringBuilder();  // text decoded as UTF-8

        using (System.IO.StreamReader reader = new System.IO.StreamReader(file.FullName, System.Text.Encoding.UTF8))
        {
            sb2.Append(reader.ReadToEnd());
        }

        // Third argument: let a byte-order mark, if present, override Encoding.Default.
        using (System.IO.StreamReader reader = new System.IO.StreamReader(file.FullName, System.Text.Encoding.Default, true))
        {
            sb.Append(reader.ReadToEnd());
        }
    }
}

// UTF-8 octet layout (RFC 3629):
// 0000 0000-0000 007F - 0xxxxxxx                             (1 octet, ASCII)
// 0000 0080-0000 07FF - 110xxxxx 10xxxxxx                    (2 octets)
// 0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx           (3 octets)
// 0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (4 octets)

/// <summary>
/// Reads <paramref name="sbInputStream"/> from its current position to the end
/// and reports whether the bytes form well-structured UTF-8 that contains at
/// least one multi-octet character.
/// </summary>
/// <param name="sbInputStream">Open, readable stream; it is consumed but not closed.</param>
/// <returns>
/// <c>true</c> when every lead byte is a legal UTF-8 lead byte followed by the
/// right number of continuation bytes and at least one non-ASCII byte was seen;
/// <c>false</c> for malformed or truncated input and also for input that is
/// pure ASCII (or empty), because such input gives no evidence of UTF-8.
/// </returns>
/// <remarks>
/// Only lead-byte ranges and continuation counts are validated. Restrictions on
/// the first continuation byte (overlong 3/4-octet forms, surrogates, values
/// above U+10FFFF after 0xF4) are not checked.
/// </remarks>
private static bool IsUTF8(FileStream sbInputStream)
{
    bool bAllAscii = true;
    int cOctets = 0; // continuation octets still expected for the current character
    int raw;

    // ReadByte returns -1 at end of stream; stop there instead of treating
    // the sentinel as a 0xFF data byte.
    while ((raw = sbInputStream.ReadByte()) != -1)
    {
        byte chr = (byte)raw;

        if (cOctets > 0)
        {
            // Inside a multi-octet character: must be 10xxxxxx.
            if ((chr & 0xC0) != 0x80)
            {
                return false;
            }

            cOctets--;
            continue;
        }

        if (chr < 0x80)
        {
            continue; // plain ASCII
        }

        bAllAscii = false;

        if (chr >= 0xC2 && chr <= 0xDF)
        {
            cOctets = 1;
        }
        else if (chr >= 0xE0 && chr <= 0xEF)
        {
            cOctets = 2;
        }
        else if (chr >= 0xF0 && chr <= 0xF4)
        {
            cOctets = 3;
        }
        else
        {
            // 0x80-0xBF: continuation byte with no lead byte.
            // 0xC0/0xC1: can only start an overlong 2-octet form.
            // 0xF5-0xFF: would encode beyond U+10FFFF or is never valid.
            return false;
        }
    }

    if (cOctets > 0)
    {
        return false; // input ended in the middle of a character
    }

    // Pure ASCII is deliberately reported as "not UTF-8".
    return !bAllAscii;
}

}

}

 
 
 
免责声明:本文为网络用户发布,其观点仅代表作者个人观点,与本站无关,本站仅提供信息存储服务。文中陈述内容未经本站证实,其真实性、完整性、及时性本站不作任何保证或承诺,请读者仅作参考,并请自行核实相关内容。
 
 
© 2005- 王朝網路 版權所有 導航