Skip to content

생계형 개발자 수첩 ver4.0

이제 나이 먹어 강제로 4.0 버전업됨, 일인개발자임, 파키스탄식 코딩.. 오래된 코드도 다시 쓴다.

Tag: 인코딩

텍스트 인코딩 찾기

Posted on 2017년 4월 4일 by kimczip

가끔 개발 하다 보면 파일 입출력시 동적으로 인코딩 처리를 해야할 경우가 있다.

아래 클래스를 사용하여 이용할 경우 상당히 유용하게 쓰일 수 있다.

// -----------------------------------------------------------------------
// <copyright file="TextEncodingDetect.cs" company="AutoIt Consulting Ltd">
// Copyright (c)2014 AutoIt Consulting Ltd. All rights reserved.
// </copyright>
// -----------------------------------------------------------------------  

///////////////////////////////////////////////////////////////////////////////
//
// Copyright(c) 2014, Jonathan Bennett & AutoIt Consulting Ltd
// All rights reserved.
// http://www.autoitconsulting.com
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are met :
// *Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// * Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and / or other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
// ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
// DISCLAIMED.IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
// ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
// (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
// LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
// ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////////

    public class TextEncodingDetect
    {
        #region Fields

        private readonly byte[] utf16LEBOM = { 0xFF, 0xFE };
        private readonly byte[] utf16BEBOM = { 0xFE, 0xFF };
        private readonly byte[] utf8BOM = { 0xEF, 0xBB, 0xBF };

        private bool nullSuggestsBinary = true;
        private double utf16ExpectedNullPercent = 70;
        private double utf16UnexpectedNullPercent = 10;

        #endregion

        #region Enums

        public enum Encoding
        {
            None,               // Unknown or binary
            ANSI,               // 0-255
            ASCII,              // 0-127
            UTF8_BOM,           // UTF8 with BOM
            UTF8_NOBOM,         // UTF8 without BOM
            UTF16_LE_BOM,       // UTF16 LE with BOM
            UTF16_LE_NOBOM,     // UTF16 LE without BOM
            UTF16_BE_BOM,       // UTF16-BE with BOM
            UTF16_BE_NOBOM      // UTF16-BE without BOM
        }

        #endregion

        #region Properties

        public bool NullSuggestsBinary
        {
            set
            {
                this.nullSuggestsBinary = value;
            }
        }

        public double Utf16ExpectedNullPercent
        {
            set
            {
                if (value > 0 && value < 100)
                {
                    this.utf16ExpectedNullPercent = value;
                }
            }
        }

        public double Utf16UnexpectedNullPercent
        {
            set
            {
                if (value > 0 && value < 100)
                {
                    this.utf16UnexpectedNullPercent = value;
                }
            }
        }

        #endregion

        public static int GetBOMLengthFromEncodingMode(Encoding encoding)
        {
            int length = 0;

            if (encoding == Encoding.UTF16_BE_BOM || encoding == Encoding.UTF16_LE_BOM)
            {
                length = 2;
            }
            else if (encoding == Encoding.UTF8_BOM)
            {
                length = 3;
            }

            return length;
        }

        public Encoding CheckBOM(byte[] buffer, int size)
        {
            // Check for BOM
            if (size >= 2 && buffer[0] == this.utf16LEBOM[0] && buffer[1] == this.utf16LEBOM[1])
            {
                return Encoding.UTF16_LE_BOM;
            }
            else if (size >= 2 && buffer[0] == this.utf16BEBOM[0] && buffer[1] == this.utf16BEBOM[1])
            {
                return Encoding.UTF16_BE_BOM;
            }
            else if (size >= 3 && buffer[0] == this.utf8BOM[0] && buffer[1] == this.utf8BOM[1] && buffer[2] == this.utf8BOM[2])
            {
                return Encoding.UTF8_BOM;
            }
            else
            {
                return Encoding.None;
            }
        }

        public Encoding DetectEncoding(byte[] buffer, int size)
        {
            // First check if we have a BOM and return that if so
            Encoding encoding = this.CheckBOM(buffer, size);
            if (encoding != Encoding.None)
            {
                return encoding;
            }

            // Now check for valid UTF8
            encoding = this.CheckUTF8(buffer, size);
            if (encoding != Encoding.None)
            {
                return encoding;
            }

            // Now try UTF16 
            encoding = this.CheckUTF16NewlineChars(buffer, size);
            if (encoding != Encoding.None)
            {
                return encoding;
            }

            encoding = this.CheckUTF16ASCII(buffer, size);
            if (encoding != Encoding.None)
            {
                return encoding;
            }

            // ANSI or None (binary) then
            if (!this.DoesContainNulls(buffer, size))
            {
                return Encoding.ANSI;
            }
            else
            {
                // Found a null, return based on the preference in null_suggests_binary_
                if (this.nullSuggestsBinary)
                {
                    return Encoding.None;
                }
                else
                {
                    return Encoding.ANSI;
                }
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Checks if a buffer contains valid utf8. Returns:
        // None - not valid utf8
        // UTF8_NOBOM - valid utf8 encodings and multibyte sequences
        // ASCII - Only data in the 0-127 range. 
        ///////////////////////////////////////////////////////////////////////////////

        private Encoding CheckUTF8(byte[] buffer, int size)
        {
            // UTF8 Valid sequences
            // 0xxxxxxx  ASCII
            // 110xxxxx 10xxxxxx  2-byte
            // 1110xxxx 10xxxxxx 10xxxxxx  3-byte
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  4-byte
            //
            // Width in UTF8
            // Decimal      Width
            // 0-127        1 byte
            // 194-223      2 bytes
            // 224-239      3 bytes
            // 240-244      4 bytes
            //
            // Subsequent chars are in the range 128-191
            bool only_saw_ascii_range = true;
            uint pos = 0;
            int more_chars;

            while (pos < size)
            {
                byte ch = buffer[pos++];

                if (ch == 0 && this.nullSuggestsBinary)
                {
                    return Encoding.None;
                }
                else if (ch <= 127)
                {
                    // 1 byte
                    more_chars = 0;
                }
                else if (ch >= 194 && ch <= 223)
                {
                    // 2 Byte
                    more_chars = 1;
                }
                else if (ch >= 224 && ch <= 239)
                {
                    // 3 Byte
                    more_chars = 2;
                }
                else if (ch >= 240 && ch <= 244)
                {
                    // 4 Byte
                    more_chars = 3;
                }
                else
                {
                    return Encoding.None;               // Not utf8
                }

                // Check secondary chars are in range if we are expecting any
                while (more_chars > 0 && pos < size)
                {
                    only_saw_ascii_range = false;       // Seen non-ascii chars now

                    ch = buffer[pos++];
                    if (ch < 128 || ch > 191)
                    {
                        return Encoding.None;           // Not utf8
                    }

                    --more_chars;
                }
            }

            // If we get to here then only valid UTF-8 sequences have been processed

            // If we only saw chars in the range 0-127 then we can't assume UTF8 (the caller will need to decide)
            if (only_saw_ascii_range)
            {
                return Encoding.ASCII;
            }
            else
            {
                return Encoding.UTF8_NOBOM;
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Checks if a buffer contains text that looks like utf16 by scanning for 
        // newline chars that would be present even in non-english text.
        // Returns:
        // None - not valid utf16
        // UTF16_LE_NOBOM - looks like utf16 le
        // UTF16_BE_NOBOM - looks like utf16 be
        ///////////////////////////////////////////////////////////////////////////////

        private Encoding CheckUTF16NewlineChars(byte[] buffer, int size)
        {
            if (size < 2)
            {
                return Encoding.None;
            }

            // Reduce size by 1 so we don't need to worry about bounds checking for pairs of bytes
            size--;

            int le_control_chars = 0;
            int be_control_chars = 0;
            byte ch1, ch2;

            uint pos = 0;
            while (pos < size)
            {
                ch1 = buffer[pos++];
                ch2 = buffer[pos++];

                if (ch1 == 0)
                {
                    if (ch2 == 0x0a || ch2 == 0x0d)
                    {
                        ++be_control_chars;
                    }
                }
                else if (ch2 == 0)
                {
                    if (ch1 == 0x0a || ch1 == 0x0d)
                    {
                        ++le_control_chars;
                    }
                }

                // If we are getting both LE and BE control chars then this file is not utf16
                if (le_control_chars > 0 && be_control_chars > 0)
                {
                    return Encoding.None;
                }
            }

            if (le_control_chars > 0)
            {
                return Encoding.UTF16_LE_NOBOM;
            }
            else if (be_control_chars > 0)
            {
                return Encoding.UTF16_BE_NOBOM;
            }
            else
            {
                return Encoding.None;
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Checks if a buffer contains text that looks like utf16. This is done based
        // the use of nulls which in ASCII/script like text can be useful to identify.
        // Returns:
        // None - not valid utf16
        // UTF16_LE_NOBOM - looks like utf16 le
        // UTF16_BE_NOBOM - looks like utf16 be
        ///////////////////////////////////////////////////////////////////////////////

        private Encoding CheckUTF16ASCII(byte[] buffer, int size)
        {
            int num_odd_nulls = 0;
            int num_even_nulls = 0;

            // Get even nulls
            uint pos = 0;
            while (pos < size)
            {
                if (buffer[pos] == 0)
                {
                    num_even_nulls++;
                }

                pos += 2;
            }

            // Get odd nulls
            pos = 1;
            while (pos < size)
            {
                if (buffer[pos] == 0)
                {
                    num_odd_nulls++;
                }

                pos += 2;
            }

            double even_null_threshold = (num_even_nulls * 2.0) / size;
            double odd_null_threshold = (num_odd_nulls * 2.0) / size;
            double expected_null_threshold = this.utf16ExpectedNullPercent / 100.0;
            double unexpected_null_threshold = this.utf16UnexpectedNullPercent / 100.0;

            // Lots of odd nulls, low number of even nulls
            if (even_null_threshold < unexpected_null_threshold && odd_null_threshold > expected_null_threshold)
            {
                return Encoding.UTF16_LE_NOBOM;
            }

            // Lots of even nulls, low number of odd nulls
            if (odd_null_threshold < unexpected_null_threshold && even_null_threshold > expected_null_threshold)
            {
                return Encoding.UTF16_BE_NOBOM;
            }

            // Don't know
            return Encoding.None;
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Checks if a buffer contains any nulls. Used to check for binary vs text data.
        ///////////////////////////////////////////////////////////////////////////////

        private bool DoesContainNulls(byte[] buffer, int size)
        {
            uint pos = 0;
            while (pos < size)
            {
                if (buffer[pos++] == 0)
                {
                    return true;
                }
            }

            return false;
        }
    }
소스 보기
소스 숨김

아래는 활용 방법 예시

public static Stream ConvertEncodingData(string filePath)
{

    Stream result = null;

    byte[] buffer = System.IO.File.ReadAllBytes(filePath);
    // Detact Encoding 
    var textDetect = new Text.TextEncodingDetect();
    Text.TextEncodingDetect.Encoding encoding = textDetect.DetectEncoding(buffer, buffer.Length);

    Encoding findEncoding = Encoding.Default;

    switch (encoding)
    {
        case Text.TextEncodingDetect.Encoding.ASCII:
            findEncoding = Encoding.ASCII;
            break;
        case Text.TextEncodingDetect.Encoding.UTF8_BOM:
        case Text.TextEncodingDetect.Encoding.UTF8_NOBOM:
            findEncoding = Encoding.UTF8;
            break;
        case Text.TextEncodingDetect.Encoding.UTF16_LE_BOM:
        case Text.TextEncodingDetect.Encoding.UTF16_LE_NOBOM:
            findEncoding = Encoding.Unicode;
            break;
        case Text.TextEncodingDetect.Encoding.UTF16_BE_BOM:
        case Text.TextEncodingDetect.Encoding.UTF16_BE_NOBOM:
            findEncoding = Encoding.BigEndianUnicode;
            break;
        case Text.TextEncodingDetect.Encoding.None:
        default:
        case Text.TextEncodingDetect.Encoding.ANSI:
            findEncoding = Encoding.Default;
            break;
    }
    
    // Make Stream
    using (StreamReader reader = new StreamReader(filePath, findEncoding))
    {
        if (findEncoding == Encoding.UTF8)
        {
            byte[] data = reader.CurrentEncoding.GetBytes(reader.ReadToEnd());
            result = new MemoryStream(data);
        }
        else
        {
            byte[] strByte = reader.CurrentEncoding.GetBytes(reader.ReadToEnd());
            byte[] utf8Byte = Encoding.Convert(reader.CurrentEncoding, Encoding.UTF8, strByte);
            result = new MemoryStream(utf8Byte);

        }
        reader.Close();
    }

    return result;
}
소스 보기
소스 숨김

출저 : http://www.autoitconsulting.com (TextEncodingDetect class)

 

Posted in 3.0Tagged C#, 개발팁, 인코딩, 파일처리Leave a comment

버전

  • 3.0
  • 4.0

최근에..

  • 실행파일 디지털 서명하기 (프로그램 배포)
  • 자신의 PC에 막혀 있는 포트 검색
  • DB Table Column 이름 가져오기.
  • DLL 파일 PublicKeyToken 얻기
  • DSM 7.0에서 MariaDB 10 설정

Tags

10Gbps AMD Bitspower C# Command Control Corsair Crawling Exception F4-3200C14D-16GFX FLAREX G.Skill git gogs MariaDB MySQL NAS OpenCV Parallel PC-O9 rainmeter Ryzen scimitar Thermaltake UI Web WinForm 개발팁 개인사업자 광명시청 네트워크속도 데이터베이스 라이젠 랜선 랜케이블 리안리 메모리 명령프롬프트 수냉쿨링 수로 시놀로지 직구 커스텀쿨링 컴퓨터 퍼옴
Proudly powered by WordPress | Theme: micro, developed by DevriX.