VB 6.0中的文本文件编码

前端之家收集整理的这篇文章主要介绍了VB 6.0中的文本文件编码前端之家小编觉得挺不错的,现在分享给大家,也给大家做个参考。
我有大量的外部文件,其中包含“ANSI”和“UCS-2 Little Endian”编码格式.

现在我想使用Visual Basic 6.0将文件编码格式更改为UTF-8.我不想修改原始文件;我只想单独更改编码格式.

我在网上搜索过;但无法理解VB代码,也不知道该怎么做.

我还希望能够从UTF-8编码文件中一次读取一行.

注意.这个答案已经广泛扩展,以适应编辑后的问题,而这个问题又归结为 Visual Basic 6 and UTF-8

以下代码包含在VB6中将ANSI,UTF-16和UTF-32编码的字符串从文件转换为UTF-8字符串.您必须加载整个文件输出它.请注意,如果它是真正通用的,LineInputUTF8()方法将是LineInput(),并需要一个代码页.

Option Explicit

Private Declare Function MultiByteToWideChar Lib "Kernel32.dll" ( _
    ByVal CodePage As Long,_
    ByVal dwFlags As Long,_
    ByVal lpMultiByteStr As Long,_
    ByVal cbMultiByte As Long,_
    ByVal lpWideCharStr As Long,_
    ByVal cchWideChar As Long _
) As Long

Private Declare Function WideCharToMultiByte Lib "Kernel32.dll" ( _
    ByVal CodePage As Long,_
    ByVal cchWideChar As Long,_
    ByVal lpDefaultChar As Long,_
    ByVal lpUsedDefaultChar As Long _
) As Long

Public Const CP_ACP        As Long = 0          ' Default ANSI code page.
Public Const CP_UTF8       As Long = 65001      ' UTF8.
Public Const CP_UTF16_LE   As Long = 1200       ' UTF16 - little endian.
Public Const CP_UTF16_BE   As Long = 1201       ' UTF16 - big endian.
Public Const CP_UTF32_LE   As Long = 12000      ' UTF32 - little endian.
Public Const CP_UTF32_BE   As Long = 12001      ' UTF32 - big endian.

' Purpose:  Heuristic to determine whether bytes in a file are UTF-8.
Private Function FileBytesAreUTF8(ByVal the_iFileNo As Integer) As Boolean

    Const knSampleByteSize          As Long = 2048
    Dim nLof                        As Long
    Dim nByteCount                  As Long
    Dim nByteIndex                  As Long
    Dim nCharExtraByteCount         As Long
    Dim bytValue                    As Byte

    ' We look at the first <knSampleByteSize> bytes of the file. However,if the file is smaller,we will have to
    ' use the smaller size.
    nLof = LOF(the_iFileNo)
    If nLof < knSampleByteSize Then
        nByteCount = nLof
    Else
        nByteCount = knSampleByteSize
    End If

    ' Go to the start of the file.
    Seek #the_iFileNo,1

    For nByteIndex = 1 To nByteCount

        Get #the_iFileNo,bytValue

        ' If the character we are processing has bytes beyond 1,then we are onto the next character.
        If nCharExtraByteCount = 0 Then
            '
            ' The UTF-8 specification says that the first byte of a character has masking bits which indicate how many bytes follow.
            '
            ' See: http://en.wikipedia.org/wiki/UTF-8#Description
            '
            ' Bytes in
            ' sequence   Byte 1   Byte 2   Byte 3   Byte 4
            ' 1          0xxxxxxx
            ' 2          110xxxxx 10xxxxxx
            ' 3          1110xxxx 10xxxxxx 10xxxxxx
            ' 4          11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            '
            If (bytValue And &H80) = &H0 Then
                nCharExtraByteCount = 0
            ElseIf (bytValue And &HE0) = &HC0 Then
                nCharExtraByteCount = 1
            ElseIf (bytValue And &HF0) = &HE0 Then
                nCharExtraByteCount = 2
            ElseIf (bytValue And &HF8) = &HF0 Then
                nCharExtraByteCount = 3
            Else
                ' If none of these masks were matched,then this can't be a UTF-8 character.
                FileBytesAreUTF8 = False
                Exit Function
            End If
        Else
            ' All following bytes must be masked as in the table above.
            If (bytValue And &HC0) = &H80 Then
                nCharExtraByteCount = nCharExtraByteCount - 1
                If nCharExtraByteCount = 0 Then
                    FileBytesAreUTF8 = True
                End If
            Else
                ' Not a UTF8 character.
                FileBytesAreUTF8 = False
                Exit Function
            End If
        End If

    Next nByteIndex

End Function

' Purpose:  Take a string whose bytes are in the byte array <the_abytCPString>,with code page <the_nCodePage>,convert to a VB string.
Private Function FromCPString(ByRef the_abytCPString() As Byte,ByVal the_nCodePage As Long) As String

    Dim sOutput                     As String
    Dim nValueLen                   As Long
    Dim nOutputCharLen              As Long

    ' If the code page says this is already compatible with the VB string,then just copy it into the string. No messing.
    If the_nCodePage = CP_UTF16_LE Then
        FromCPString = the_abytCPString()
    Else

        ' Cache the input length.
        nValueLen = UBound(the_abytCPString) - LBound(the_abytCPString) + 1

        ' See how big the output buffer will be.
        nOutputCharLen = MultiByteToWideChar(the_nCodePage,0&,VarPtr(the_abytCPString(LBound(the_abytCPString))),nValueLen,0&)

        ' Resize output byte array to the size of the UTF-8 string.
        sOutput = Space$(nOutputCharLen)

        ' Make this API call again,this time giving a pointer to the output byte array.
        MultiByteToWideChar the_nCodePage,StrPtr(sOutput),nOutputCharLen

        ' Return the array.
        FromCPString = sOutput

    End If

End Function

Public Function GetContents(ByVal the_sTextFile As String,ByRef out_nCodePage As Long,Optional ByVal the_nDesiredCodePage As Long = -1,Optional ByRef out_bContainedBOM As Boolean) As String

    Dim iFileNo                     As Integer
    Dim abytFileContents()          As Byte
    Dim nDataSize                   As Long

    iFileNo = FreeFile

    OpenForInput the_sTextFile,iFileNo,out_nCodePage,the_nDesiredCodePage,out_bContainedBOM

    ' We want to read the entire contents of the file (not including any BOM value).
    ' After calling OpenForInput(),the file pointer should be positioned after any BOM.
    ' So size file contents buffer to <file size> - <current position> + 1.
    nDataSize = LOF(iFileNo) - Seek(iFileNo) + 1
    ReDim abytFileContents(1 To nDataSize)
    Get #iFileNo,abytFileContents()

    Close iFileNo

    ' Now we must convert this to UTF-8. But we have to first convert to the Windows NT standard UTF-16 LE.
    GetContents = FromCPString(abytFileContents(),out_nCodePage)

End Function

' Purpose:  Reads up to the end of the current line of the file,repositions to the beginning of the next line,if any,and
'           outputs all characters found.
' Inputs:   the_nFileNo     The number of the file.
' Outputs:  out_sLine       The line from the current position in the file.
' Return:   True if there is more data.
Public Function LineInputUTF8(ByVal the_nFileNo As Integer,ByRef out_sLine As String) As Boolean

    Dim bytValue            As Byte
    Dim abytLine()          As Byte
    Dim nStartOfLinePos     As Long
    Dim nEndOfLinePos       As Long
    Dim nStartOfNextLine    As Long
    Dim nLineLen            As Long

    ' Save the current file position as the beginning of the line,and cache this value.
    nStartOfLinePos = Seek(the_nFileNo)

    ' Retrieves the first byte from the current position.
    Get #the_nFileNo,bytValue

    ' Loop until the end of file is encountered.
    Do Until EOF(the_nFileNo)

        ' Check whether this byte represents a carriage return or line Feed character (indicating new line).
        If bytValue = 13 Or bytValue = 10 Then
            ' By this point,the current position is *after* the CR or LF character,so to get the position of the
            ' last byte in the line,we must go back two bytes.
            nEndOfLinePos = Seek(the_nFileNo) - 2

            ' If this is a carriage return,then we must check the next character.
            If bytValue = 13 Then
                Get #the_nFileNo,bytValue
                ' Is this a line Feed?
                If bytValue = 10 Then
                ' Yes. Assume that CR-LF counts as a single NewLine. So the start of the next line should skip over the line Feed.
                    nStartOfNextLine = nEndOfLinePos + 3
                Else
                ' No. The start of the next line is the current position.
                    nStartOfNextLine = nEndOfLinePos + 2
                End If
            ElseIf bytValue = 10 Then
            ' If this is a line Feed,then the start of the next line is the current position.
                nStartOfNextLine = nEndOfLinePos + 2
            End If

            ' Since we have processed all the bytes in the line,exit the loop.
            Exit Do
        End If

        ' Get the next byte.
        Get #the_nFileNo,bytValue
    Loop

    ' Check to see if there was an end of line.
    If nEndOfLinePos = 0 Then
    ' No,this is the end of the file - so use all the remaining characters.
        nLineLen = Seek(the_nFileNo) - nStartOfLinePos - 1
    Else
    ' Yes - so use all the characters up to the end of line position.
        nLineLen = nEndOfLinePos - nStartOfLinePos + 1
    End If

    ' Is this line empty?
    If nLineLen = 0 Then
    ' Yes - just return an empty string.
        out_sLine = vbNullString
    Else
    ' No - pull all the bytes from the beginning to the end of the line into a byte array,and then convert that from UTF-8 to a VB string.
        ReDim abytLine(1 To nLineLen)
        Get #the_nFileNo,nStartOfLinePos,abytLine()
        out_sLine = FromCPString(abytLine(),CP_UTF8)
    End If

    ' If there is a line afterwards,then move to the beginning of the line,and return True.
    If nStartOfNextLine > 0 Then
        Seek #the_nFileNo,nStartOfNextLine
        LineInputUTF8 = True
    End If

End Function

' Purpose:  Analogue of 'Open "fileName" For Input As #fileNo' - but also return what type of text this is via a Code Page value.
' Inputs:   the_sFileName
'           the_iFileNo
'           (the_nDesiredCodePage)  The code page that you want to use with this file.
'                                   If this value is set to the default,-1,this indicates that the code page will be ascertained from the file.
' Outputs:  out_nCodePage           There are only six valid values that are returned if <the_nDesiredCodePage> was set to -1.
'               CP_ACP        ANSI code page
'               CP_UTF8       UTF-8
'               CP_UTF16LE    UTF-16 Little Endian (VB and NT default string encoding)
'               CP_UTF16BE    UTF-16 Big Endian
'               CP_UTF32LE    UTF-32 Little Endian
'               CP_UTF32BE    UTF-32 Big Endian
'           (out_bContainedBOM)     If this was set to True,then the file started with a BOM (Byte Order Marker).
Public Sub OpenForInput(ByRef the_sFilename As String,ByVal the_iFileNo As Integer,Optional ByRef out_bContainedBOM As Boolean)

    ' Note if we want to take account of every case,we should read in the first four bytes,and check for UTF-32 low and high endian BOMs,check
    ' the first three bytes for the UTF-8 BOM,and finally check the first two bytes for UTF-16 low and hight endian BOMs.
    Dim abytBOM(1 To 4)             As Byte
    Dim nCodePage                   As Long

    ' By default,there is no BOM.
    out_bContainedBOM = False

    Open the_sFilename For Binary Access Read As #the_iFileNo

    ' We are interested in -1 (ascertain code page),and then varIoUs UTF encodings.
    Select Case the_nDesiredCodePage
    Case -1,CP_UTF8,CP_UTF16_BE,CP_UTF16_LE,CP_UTF32_BE,CP_UTF32_LE

        ' Default code page.
        nCodePage = CP_ACP

        ' Pull in the first four bytes to determine the BOM (byte order marker).
        Get #the_iFileNo,abytBOM()

        ' The following are the BOMs for text files:
        '
        ' FF FE         UTF-16,little endian
        ' FE FF         UTF-16,big endian
        ' EF BB BF      UTF-8
        ' FF FE 00 00   UTF-32,little endian
        ' 00 00 FE FF   UTF-32,big-endian
        '
        ' Work out the code page from this information.

        Select Case abytBOM(1)
        Case &HFF
            If abytBOM(2) = &HFE Then
                If abytBOM(3) = 0 And abytBOM(4) = 0 Then
                    nCodePage = CP_UTF32_LE
                Else
                    nCodePage = CP_UTF16_LE
                End If
            End If
        Case &HFE
            If abytBOM(2) = &HFF Then
                nCodePage = CP_UTF16_BE
            End If
        Case &HEF
            If abytBOM(2) = &HBB And abytBOM(3) = &HBF Then
                nCodePage = CP_UTF8
            End If
        Case &H0
            If abytBOM(2) = &H0 And abytBOM(3) = &HFE And abytBOM(4) = &HFF Then
                nCodePage = CP_UTF32_BE
            End If
        End Select

        ' Did we match any BOMs?
        If nCodePage = CP_ACP Then
        ' No - we are still defaulting to the ANSI code page.
            ' Special check for UTF-8. The BOM is not specified in the standard for UTF-8,but according to Wikipedia (which is always right :-) ),' only Microsoft includes this marker at the beginning of files.
            If FileBytesAreUTF8(the_iFileNo) Then
                out_nCodePage = CP_UTF8
            Else
                out_nCodePage = CP_ACP
            End If
        Else
        ' Yes - we have worked out the code page from the BOM.
            ' If no code page was suggested,we now return the code page we found.
            If the_nDesiredCodePage = -1 Then
                out_nCodePage = nCodePage
            End If

            ' Inform the caller that a BOM was found.
            out_bContainedBOM = True
        End If

        ' Reset the file pointer to the beginning of the file data.
        If out_bContainedBOM Then
            ' Note that if the code page found was one of the two UTF-32 values,then we are already in the correct position.
            ' Otherwise,we have to move to just after the end of the BOM.
            Select Case nCodePage
            Case CP_UTF16_BE,CP_UTF16_LE
                Seek #the_iFileNo,3
            Case CP_UTF8
                Seek #the_iFileNo,4
            End Select
        Else
            ' There is no BOM,so simply go the beginning of the file.
            Seek #the_iFileNo,1
        End If

    Case Else
        out_nCodePage = the_nDesiredCodePage
    End Select

End Sub

' Purpose:  Analogue of 'Open "fileName" For Append As #fileNo'
Public Sub OpenForAppend(ByRef the_sFilename As String,Optional ByVal the_nCodePage As Long = CP_ACP,Optional ByVal the_bPrefixWithBOM As Boolean = True)

    ' Open the file and move to the end of the file.
    Open the_sFilename For Binary Access Write As #the_iFileNo
    Seek the_iFileNo,LOF(the_iFileNo) + 1

    If the_bPrefixWithBOM Then
        WriteBOM the_iFileNo,the_nCodePage
    End If

End Sub

' Purpose:  Analogue of 'Open "fileName" For Output As #fileNo'
Public Sub OpenForOutput(ByRef the_sFilename As String,Optional ByVal the_bPrefixWithBOM As Boolean = True)

    ' Ensure we overwrite the file by deleting it ...
    On Error Resume Next
    Kill the_sFilename
    On Error GoTo 0

    ' ... before creating it.
    Open the_sFilename For Binary Access Write As #the_iFileNo

    If the_bPrefixWithBOM Then
        WriteBOM the_iFileNo,the_nCodePage
    End If

End Sub

' Purpose:  Analogue of the 'Print #fileNo,value' statement. But only one value allowed.
'           Setting <the_bAppendNewLine> = False is analagous to 'Print #fileNo,value;'.
Public Sub Print_(ByVal the_iFileNo As Integer,ByRef the_sValue As String,Optional ByVal the_bAppendNewLine As Boolean = True)

    Const kbytNull                  As Byte = 0
    Const kbytCarriageReturn        As Byte = 13
    Const kbytNewLine               As Byte = 10

    Put #the_iFileNo,ToCPString(the_sValue,the_nCodePage)

    If the_bAppendNewLine Then
        Select Case the_nCodePage
        Case CP_UTF16_BE
            Put #the_iFileNo,kbytNull
            Put #the_iFileNo,kbytCarriageReturn
            Put #the_iFileNo,kbytNewLine
        Case CP_UTF16_LE
            Put #the_iFileNo,kbytNewLine
            Put #the_iFileNo,kbytNull
        Case CP_UTF32_BE
            Put #the_iFileNo,kbytNewLine
        Case CP_UTF32_LE
            Put #the_iFileNo,kbytNull
        Case Else
            Put #the_iFileNo,kbytNewLine
        End Select
    End If

End Sub

Public Sub PutContents(ByRef the_sFilename As String,ByRef the_sFileContents As String,Optional the_bPrefixWithBOM As Boolean)

    Dim iFileNo                     As Integer

    iFileNo = FreeFile
    OpenForOutput the_sFilename,the_nCodePage,the_bPrefixWithBOM
    Print_ iFileNo,the_sFileContents,False
    Close iFileNo

End Sub

' Purpose:  Converts a VB string (UTF-16) to UTF8 - as a binary array.
Private Function ToCPString(ByRef the_sValue As String,ByVal the_nCodePage As Long) As Byte()

    Dim abytOutput()                As Byte
    Dim nValueLen                   As Long
    Dim nOutputByteLen              As Long

    If the_nCodePage = CP_UTF16_LE Then
        ToCPString = the_sValue
    Else

        ' Cache the input length.
        nValueLen = Len(the_sValue)

        ' See how big the output buffer will be.
        nOutputByteLen = WideCharToMultiByte(the_nCodePage,StrPtr(the_sValue),0&)

        If nOutputByteLen > 0 Then
            ' Resize output byte array to the size of the UTF-8 string.
            ReDim abytOutput(1 To nOutputByteLen)

            ' Make this API call again,this time giving a pointer to the output byte array.
            WideCharToMultiByte the_nCodePage,VarPtr(abytOutput(1)),nOutputByteLen,0&
        End If

        ' Return the array.
        ToCPString = abytOutput()

    End If

End Function

Private Sub WriteBOM(ByVal the_iFileNo As Integer,ByVal the_nCodePage As Long)

    ' FF FE         UTF-16,little endian
    ' FE FF         UTF-16,big endian
    ' EF BB BF      UTF-8
    ' FF FE 00 00   UTF-32,little endian
    ' 00 00 FE FF   UTF-32,big-endian

    Select Case the_nCodePage
    Case CP_UTF8
        Put #the_iFileNo,CByte(&HEF)
        Put #the_iFileNo,CByte(&HBB)
        Put #the_iFileNo,CByte(&HBF)
    Case CP_UTF16_LE
        Put #the_iFileNo,CByte(&HFF)
        Put #the_iFileNo,CByte(&HFE)
    Case CP_UTF16_BE
        Put #the_iFileNo,CByte(&HFE)
        Put #the_iFileNo,CByte(&HFF)
    Case CP_UTF32_LE
        Put #the_iFileNo,CByte(&H0)
        Put #the_iFileNo,CByte(&H0)
    Case CP_UTF32_BE
        Put #the_iFileNo,CByte(&HFF)
    End Select

End Sub

以下代码添加到具有带Lucida Console字体的VSFlexGrid控件的Form中 – 纯粹是为了提供一种显示尽可能多的字符的方法

Option Explicit

Private Sub Command_Click()
    Example_ConvertFileToUTF8
End Sub

Private Sub Command2_Click()
    Example_IterateUTF8Lines
End Sub

Private Sub Command3_Click()
    Example_ReadWriteUTF8Lines
End Sub

Private Sub Form_Load()
    VSFlexGrid.ColWidth(0) = 7000!
End Sub

' Purpose:  Converts *any* pure text file (UTF16,ASCII,ANSI) to UTF8.
Private Sub Example_ConvertFileToUTF8()

    Dim nCodePage                   As Long
    Dim bContainedBOM               As Boolean
    Dim sFileContents               As String

    ' Read in contents.
    sFileContents = TextFile.GetContents("C:\MysteryEncoding.txt",nCodePage,bContainedBOM)

    ' And then convert to UTF8.
    TextFile.PutContents "C:\output.txt",sFileContents,bContainedBOM

End Sub

' Purpose:  Iterates through each line of a UTF-8 text file,and adds it to a control which can display VB strings containing non-ANSI characters.
'           In this case,I am adding items to a FlexGrid with Font = "Lucida Console".
Private Sub Example_IterateUTF8Lines()

    Dim iFileNo             As Integer
    Dim lCodePage           As Long
    Dim sLine               As String

    iFileNo = FreeFile

    TextFile.OpenForInput "C:\UTF8.txt",lCodePage

    If lCodePage = CP_UTF8 Then
        Do While TextFile.LineInputUTF8(iFileNo,sLine)
            VSFlexGrid.AddItem sLine
        Loop
        VSFlexGrid.AddItem sLine
    Else
        MsgBox "This is not a UTF8 file."
    End If

    Close #iFileNo

End Sub

Private Sub Example_ReadWriteUTF8Lines()

    Dim iFileNoInput        As Integer
    Dim iFileNoOutput       As Integer
    Dim lCodePage           As Long
    Dim bBOM                As Boolean
    Dim sLine               As String

    iFileNoInput = FreeFile
    TextFile.OpenForInput "C:\UTF8.txt",iFileNoInput,lCodePage,bBOM

    If lCodePage = CP_UTF8 Then

        iFileNoOutput = FreeFile
        TextFile.OpenForOutput "C:\output.txt",iFileNoOutput,bBOM

        Do While TextFile.LineInputUTF8(iFileNoInput,sLine)
            TextFile.Print_ iFileNoOutput,sLine,lCodePage
        Loop
        TextFile.Print_ iFileNoOutput,False

        Close #iFileNoOutput

    Else
        MsgBox "This is not a UTF8 file."
    End If

    Close #iFileNoInput

End Sub
原文链接:https://www.f2er.com/vb/255838.html

猜你在找的VB相关文章