如何找出文件的编码?(C#)


How to find out the Encoding of a File? C#

我需要找出某个目录中哪些文件是 UTF-8 编码的、哪些是 ANSI 编码的,以便之后把它们转换成我选定的其他编码。我的问题是:如何确定一个文件是 UTF-8 还是 ANSI 编码?这两种编码实际上都可能出现在我的文件中。


没有可靠的方法可以做到这一点(因为该文件可能只是随机的二进制数据),不过 Windows 记事本所采用的检测过程在 Michael S. Kaplan 的博客中有详细介绍:

网址:http://www.siao2.com/2007/04/22/2239345.aspx

  • Check the first two bytes;
    1. If there is a UTF-16 LE BOM, then treat it (and load it) as a "Unicode" file;
    2. If there is a UTF-16 BE BOM, then treat it (and load it) as a "Unicode (Big Endian)" file;
    3. If the first two bytes look like the start of a UTF-8 BOM, then check the next byte and if we have a UTF-8 BOM, then treat it (and load it) as a "UTF-8" file;
  • Check with IsTextUnicode to see if that function think it is BOM-less UTF-16 LE, if so, then treat it (and load it) as a "Unicode" file;
  • Check to see if it UTF-8 using the original RFC 2279 definition from 1998 and if it then treat it (and load it) as a "UTF-8" file;
  • Assume an ANSI file using the default system code page of the machine.
  • Now note that there are some holes
    here, like the fact that step 2 does
    not do quite as good with BOM-less
    UTF-16 BE (there may even be a bug
    here, I'm not sure -- if so it's a bug
    in Notepad beyond any bug in
    IsTextUnicode).


    http://msdn.microsoft.com/en-us/netframework/aa569610.aspx (问题 2)

    There is no great way to detect an
    arbitrary ANSI code page, though there
    have been some attempts to do this
    based on the probability of certain
    byte sequences in the middle of text.
    We don't try that in StreamReader. A
    few file formats like XML or HTML have
    a way of specifying the character set
    on the first line in the file, so Web
    browsers, databases, and classes like
    XmlTextReader can read these files
    correctly. But many text files don't
    have this type of information built
    in.


    在下面的代码中,Unicode(UTF-16 LE)、UTF-8 和 Unicode Big Endian(UTF-16 BE)被视为不同的类型;ANSI 被视为与 UTF-8 相同。

    1
    2
    3
    4
    5
    6
    7
    8
    9
    10
    11
    12
    13
    14
    15
    16
    17
    18
    19
    20
    21
    22
    23
    24
    25
    26
    27
    28
    29
    30
    31
    32
    33
    34
    35
    36
    37
    38
    39
    40
    41
    42
    43
    44
    45
    46
    47
    48
    49
    50
    51
    52
    53
    54
    55
    56
    57
    58
    59
    60
    61
    62
    63
    64
    65
    66
    67
    68
    69
    70
    71
    72
    73
    74
    75
    76
    77
    /// <summary>
    /// Best-effort detection of the text encoding of a file from its raw bytes.
    /// </summary>
    /// <remarks>
    /// Detection order: byte order marks first (UTF-8, UTF-32 LE, UTF-16 BE,
    /// UTF-16 LE), then a structural check for BOM-less UTF-8. Anything else is
    /// reported as <see cref="System.Text.Encoding.Default"/>. Pure ASCII and
    /// empty content pass the UTF-8 check and are therefore reported as UTF-8.
    /// </remarks>
    public class EncodingType
    {
        // Upper bound on the number of bytes read for detection: the largest
        // single-dimension byte array the CLR allows (value of Array.MaxLength).
        // Files bigger than this are classified from their leading bytes.
        private const int MaxBufferLength = 0x7FFFFFC7;

        /// <summary>
        /// Detects the encoding of the file at the given path.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to open for reading.</param>
        /// <returns>The detected encoding (see the class remarks for the rules).</returns>
        public static System.Text.Encoding GetType(string FILE_NAME)
        {
            // 'using' guarantees the handle is released even if detection throws.
            using (FileStream fs = new FileStream(FILE_NAME, FileMode.Open, FileAccess.Read))
            {
                return GetType(fs);
            }
        }

        /// <summary>
        /// Detects the encoding of the content of an open file stream, reading
        /// from the stream's current position.
        /// </summary>
        /// <param name="fs">
        /// Stream to inspect. It is closed before this method returns, because the
        /// reader wrapped around it is disposed.
        /// </param>
        /// <returns>The detected encoding (see the class remarks for the rules).</returns>
        public static System.Text.Encoding GetType(FileStream fs)
        {
            byte[] content;
            using (BinaryReader reader = new BinaryReader(fs, System.Text.Encoding.Default))
            {
                // Clamp rather than parse: a length above the array limit must not
                // silently turn into "read zero bytes".
                long length = fs.Length;
                int count = length > MaxBufferLength ? MaxBufferLength : (int)length;
                content = reader.ReadBytes(count);
            }

            return DetectEncoding(content);
        }

        // Maps raw bytes to an encoding: BOM checks first, then BOM-less UTF-8, else the system default.
        private static Encoding DetectEncoding(byte[] data)
        {
            if (data.Length >= 3 && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF)
            {
                return Encoding.UTF8;
            }

            // The UTF-32 LE BOM (FF FE 00 00) starts with the UTF-16 LE BOM,
            // so it has to be tested before the two-byte marks.
            if (data.Length >= 4 && data[0] == 0xFF && data[1] == 0xFE && data[2] == 0x00 && data[3] == 0x00)
            {
                return Encoding.UTF32;
            }

            if (data.Length >= 2)
            {
                if (data[0] == 0xFE && data[1] == 0xFF)
                {
                    return Encoding.BigEndianUnicode;
                }

                if (data[0] == 0xFF && data[1] == 0xFE)
                {
                    return Encoding.Unicode;
                }
            }

            return IsUTF8Bytes(data) ? Encoding.UTF8 : Encoding.Default;
        }

        /// <summary>
        /// Checks whether the bytes are structurally valid UTF-8: every lead byte
        /// announces 2 to 6 bytes (the RFC 2279 forms) and is followed by exactly
        /// that many continuation bytes (10xxxxxx).
        /// </summary>
        /// <param name="data">Bytes to examine.</param>
        /// <returns>
        /// true if every sequence is well formed (including empty or pure ASCII
        /// input); false otherwise, including when the data ends in the middle of
        /// a multi-byte sequence.
        /// </returns>
        private static bool IsUTF8Bytes(byte[] data)
        {
            // Bytes still expected for the current character, counting the lead
            // byte itself; 1 means "at a character boundary".
            int charByteCounter = 1;

            for (int i = 0; i < data.Length; i++)
            {
                byte curByte = data[i];

                if (charByteCounter == 1)
                {
                    if (curByte >= 0x80)
                    {
                        // Count the leading 1-bits of the lead byte by shifting
                        // them out through the top bit.
                        while (((curByte <<= 1) & 0x80) != 0)
                        {
                            charByteCounter++;
                        }

                        // 1 => a stray continuation byte (10xxxxxx) used as a lead;
                        // more than 6 => 0xFE / 0xFF, never valid in UTF-8.
                        if (charByteCounter == 1 || charByteCounter > 6)
                        {
                            return false;
                        }
                    }
                }
                else
                {
                    if ((curByte & 0xC0) != 0x80)
                    {
                        return false;
                    }

                    charByteCounter--;
                }
            }

            // A sequence cut off at the end of the data is simply not valid UTF-8;
            // report that instead of throwing.
            return charByteCounter == 1;
        }

    }


    请参阅 CodeProject 上的这两篇文章——仅凭文件内容来判断文件编码并不容易:

    • 从字节顺序标记(BOM)检测编码
    • 检测输入和输出文本的编码