MhtBuilder_demo.zip

MhtBuilder_demo.zip


Zip contents | Download all

Imports System.Text
Imports System.Text.RegularExpressions
Imports System.IO

''' <summary>
''' This class builds the following from a URL:
'''
'''   .mht file (Web Archive, single file)
'''   .htm file with dereferenced (absolute) references (Web Page, HTML Only)
'''   .htm file plus all referenced files in a local subfolder (Web Page, complete) 
'''   .txt file (non-HTML contents of Web Page)
'''
''' The .mht format is based on RFC2557 
'''    "compliant Multipart MIME Message (mhtml web archive)"
'''    http://www.ietf.org/rfc/rfc2557.txt
''' </summary>
''' <remarks>
'''   Jeff Atwood
'''   http://www.codinghorror.com/
''' </remarks>
Public Class Builder

    Private _MhtBuilder As StringBuilder
    Private _StripScriptFromHtml As Boolean = False
    Private _StripIframeFromHtml As Boolean = False
    Private _AllowRecursion As Boolean = True
    Private _AddWebMark As Boolean = True
    Private _ForcedEncoding As System.Text.Encoding = Nothing

    Private _HtmlFile As WebFile

    Friend WebFiles As New SortedList
    Friend WebClient As New WebClientEx

    Private Const _MimeBoundaryTag As String = "----=_NextPart_000_00"

    Public Enum FileStorage
        Memory
        DiskPermanent
        DiskTemporary
    End Enum

    Public Sub New()
        _HtmlFile = New WebFile(Me, FileStorage.Memory)
    End Sub

#Region "  Properties"

    ''' <summary>
    ''' Specifies the target Url we want to save
    ''' </summary>
    Public Property Url() As String
        Get
            Return _HtmlFile.Url
        End Get
        Set(ByVal Value As String)
            WebFiles.Clear()
            _HtmlFile.Url = Value
        End Set
    End Property

    ''' <summary>
    ''' returns the Mime content-type string designation of a mht file
    ''' </summary>
    Public ReadOnly Property MhtContentType() As String
        Get
            Return "message/rfc822"
        End Get
    End Property

    ''' <summary>
    ''' *only* set this if you want to FORCE a specific text encoding for all the HTML pages you're downloading;
    ''' otherwise the text encoding is autodetected, which is generally what you want
    ''' </summary>
    Public Property TextEncoding() As System.Text.Encoding
        Get
            Return _ForcedEncoding
        End Get
        Set(ByVal Value As System.Text.Encoding)
            _ForcedEncoding = Value
        End Set
    End Property

    ''' <summary>
    ''' Add the "Mark of the web" to retrieved HTML content so it can run 
    ''' locally on Windows XP SP2
    ''' </summary>
    ''' <remarks>
    '''   http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
    ''' </remarks>
    Public Property AddWebMark() As Boolean
        Get
            Return _AddWebMark
        End Get
        Set(ByVal Value As Boolean)
            _AddWebMark = Value
        End Set
    End Property

    ''' <summary>
    ''' Strip all &lt;SCRIPT&gt; blocks from any retrieved HTML
    ''' </summary>
    Public Property StripScripts() As Boolean
        Get
            Return _StripScriptFromHtml
        End Get
        Set(ByVal Value As Boolean)
            _StripScriptFromHtml = Value
        End Set
    End Property

    ''' <summary>
    ''' Strip all &lt;IFRAME&gt; blocks from any retrieved HTML
    ''' </summary>
    Public Property StripIframes() As Boolean
        Get
            Return _StripIframeFromHtml
        End Get
        Set(ByVal Value As Boolean)
            _StripIframeFromHtml = Value
        End Set
    End Property

    ''' <summary>
    ''' The browser identification string that is sent in all HTTP requests;
    ''' using a different string can produce simplified (downlevel) HTML
    ''' </summary>
    ''' <remarks>
    ''' defaults to browser ID string of vanilla IE6 as seen in XP SP2
    ''' </remarks>
    Public Property BrowserIdString() As String
        Get
            Return WebClient.BrowserIdString
        End Get
        Set(ByVal Value As String)
            WebClient.BrowserIdString = Value
        End Set
    End Property

    ''' <summary>
    ''' the target URL requires authentication
    ''' if not provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property AuthenticationRequired() As Boolean
        Get
            Return WebClient.AuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            WebClient.AuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP Authentication user for Url
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationUser() As String
        Get
            Return WebClient.AuthenticationUser
        End Get
        Set(ByVal Value As String)
            WebClient.AuthenticationUser = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP Authentication password for Url
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property AuthenticationPassword() As String
        Get
            Return WebClient.AuthenticationPassword
        End Get
        Set(ByVal Value As String)
            WebClient.AuthenticationPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy username
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyUser() As String
        Get
            Return WebClient.ProxyUser
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyUser = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy password
    ''' if left blank, the current user's credentials will be sent
    ''' </summary>
    Public Property ProxyPassword() As String
        Get
            Return WebClient.ProxyPassword
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyPassword = Value
        End Set
    End Property

    ''' <summary>
    ''' HTTP proxy URL
    ''' if provided, proxy will always be used; if left blank, proxy will not be used
    ''' </summary>
    Public Property ProxyUrl() As String
        Get
            Return WebClient.ProxyUrl
        End Get
        Set(ByVal Value As String)
            WebClient.ProxyUrl = Value
        End Set
    End Property

    ''' <summary>
    ''' Proxy requires authentication
    ''' if not provided, the current user's credentials will automatically be sent
    ''' </summary>
    Public Property ProxyAuthenticationRequired() As Boolean
        Get
            Return WebClient.ProxyAuthenticationRequired
        End Get
        Set(ByVal Value As Boolean)
            WebClient.ProxyAuthenticationRequired = Value
        End Set
    End Property

    ''' <summary>
    ''' allow recursive retrieval of any embedded HTML (typically IFRAME or FRAME)
    ''' turn off to prevent infinite recursion in the case of pages that reference themselves..
    ''' </summary>
    Public Property AllowRecursiveFileRetrieval() As Boolean
        Get
            Return _AllowRecursion
        End Get
        Set(ByVal Value As Boolean)
            _AllowRecursion = Value
        End Set
    End Property

#End Region

#Region "  Public"

    ''' <summary>
    ''' Saves URL to disk as a single HTML file, modified with absolute external references
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the HTML file that was saved to disk</returns>
    Public Function SavePage(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".htm;.html")
        DownloadHtmlFile(url)
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.SaveToFile()
        Return _HtmlFile.DownloadPath
    End Function

    ''' <summary>
    ''' Saves URL to disk as a plain text file, stripping all HTML from it
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the text file that was saved to disk</returns>
    Public Function SavePageText(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".txt")
        DownloadHtmlFile(url)
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.SaveAsTextFile()
        Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".txt")
    End Function

    ''' <summary>
    ''' Saves URL to disk as multiple files: a single HTML file, modified with local references
    ''' to externally referenced files in a subfolder
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="url">fully qualified URL you wish to save</param>
    ''' <returns>the complete path of the HTML file that was saved to disk</returns>
    Public Function SavePageComplete(ByVal outputFilePath As String, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".htm;.html")
        DownloadHtmlFile(url)

        '-- first, let's get all the external files
        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.UseHtmlTitleAsFilename = True
        _HtmlFile.DownloadExternalFiles(FileStorage.DiskPermanent, _AllowRecursion)

        '-- convert any references in external files
        For Each de As DictionaryEntry In WebFiles
            Dim ef As WebFile = DirectCast(de.Value, WebFile)
            If ef.IsHtml Or ef.IsCss Then
                ef.ConvertReferencesToLocal()
                ef.SaveToFile()
            End If
        Next

        '-- convert the main HTML references
        _HtmlFile.ConvertReferencesToLocal()
        _HtmlFile.SaveToFile()

        Return _HtmlFile.DownloadPath
    End Function

    ''' <summary>
    ''' Generates a string representation of the URL as a Mht archive file
    ''' using exclusively in-memory storage
    ''' </summary>
    ''' <param name="url">fully qualified URL you wish to render to Mht</param>
    ''' <returns>string representation of MHT file</returns>
    Public Function GetPageArchive(Optional ByVal url As String = "") As String
        DownloadHtmlFile(url)
        _HtmlFile.DownloadExternalFiles(FileStorage.Memory, _AllowRecursion)
        AppendMhtHeader(_HtmlFile)
        AppendMhtFiles()
        Return FinalizeMht()
    End Function

    ''' <summary>
    ''' Saves URL to disk as a single file Mht archive
    ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file
    ''' </summary>
    ''' <param name="outputFilePath">path to generate to, or filename to generate</param>
    ''' <param name="st">type of storage to use when generating the Mht archive</param>
    ''' <param name="url">fully qualified URL you wish to save as Mht</param>
    ''' <returns>the complete path of the Mht archive file that was generated</returns>
    Public Function SavePageArchive(ByVal outputFilePath As String, ByVal st As FileStorage, Optional ByVal url As String = "") As String
        ValidateFilename(outputFilePath, ".mht")
        DownloadHtmlFile(url)

        _HtmlFile.DownloadPath = outputFilePath
        _HtmlFile.UseHtmlTitleAsFilename = True

        '-- if set to permanent disk storage, make a local copy of the HTML
        If st = FileStorage.DiskPermanent Then
            _HtmlFile.SaveToFile(Path.ChangeExtension(_HtmlFile.DownloadPath, ".htm"))
        End If

        '-- download all references
        _HtmlFile.DownloadExternalFiles(st, _AllowRecursion)

        '-- build the Mht 
        AppendMhtHeader(_HtmlFile)
        AppendMhtFiles()
        FinalizeMht(Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht"))

        '-- possibly destroy temporary resources
        If st = FileStorage.DiskTemporary Then
            For Each de As DictionaryEntry In WebFiles
                Dim ef As WebFile = DirectCast(de.Value, WebFile)
                If ef.Storage = FileStorage.DiskTemporary Then
                    File.Delete(ef.DownloadPath)
                End If
                '-- if the temp folder is empty, kill that too
                If Directory.GetFileSystemEntries(ef.DownloadFolder).Length = 0 Then
                    Directory.Delete(ef.DownloadFolder)
                End If
            Next
        End If
        WebFiles.Clear()

        Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht")
    End Function

#End Region

#Region "  Private"

    ''' <summary>
    ''' returns the root HTML we'll use to generate everything else;
    ''' this is tracked in the _HtmlFile object, which is always FileStorage.Memory
    ''' </summary>
    Private Sub DownloadHtmlFile(ByVal url As String)
        If url <> "" Then
            Me.Url = url
        End If
        _HtmlFile.Storage = FileStorage.Memory
        _HtmlFile.WasAppended = False
        _HtmlFile.Download()
        If Not _HtmlFile.WasDownloaded Then
            Throw New Exception("unable to download '" & Me.Url & "': " & _
                _HtmlFile.DownloadException.Message, _HtmlFile.DownloadException)
        End If
    End Sub

    ''' <summary>
    ''' returns true if this path refers to a directory (vs. a filename)
    ''' </summary>
    Private Function IsDirectory(ByVal FilePath As String) As Boolean
        Return FilePath.EndsWith("\")
    End Function


    ''' <summary>
    ''' ensures that the path, if it contains a filename, matches one of the semicolon delimited 
    ''' filetypes provided in fileExtension
    ''' </summary>
    Private Sub ValidateFilename(ByVal FilePath As String, ByVal fileExtensions As String)
        If IsDirectory(FilePath) Then Return
        Dim ext As String = Path.GetExtension(FilePath)
        If ext = "" Then
            Throw New Exception("The filename provided, '" & Path.GetFileName(FilePath) & _
                "', has no extension. If are specifying a folder, make sure it ends in a trailing slash. " & _
                "The expected file extension(s) are '" & fileExtensions & "'")
        End If
        If Not Regex.IsMatch(fileExtensions, ext & "(;|$)", RegexOptions.IgnoreCase) Then
            Throw New Exception("The extension of the filename provided, '" & Path.GetFileName(FilePath) & _
                "', does not have the expected extension(s) '" & fileExtensions & "'")
        End If
    End Sub

    ''' <summary>
    ''' removes all unsafe filesystem characters to form a valid filesystem filename
    ''' </summary>
    Private Function MakeValidFilename(ByVal s As String) As String
        '-- replace any invalid filesystem chars with underscore
        Return Regex.Replace(s, "[\/\\\:\*\?\""\<\>\|]", "_")
    End Function

    ''' <summary>
    ''' appends all downloaded files (from _ExternalFiles) to our MhtBuilder
    ''' </summary>
    ''' <param name="st">type of storage to use when downloading external files</param>
    ''' <param name="storagePath">path to use for downloaded external files</param>
    Private Sub AppendMhtFiles()
        For Each de As DictionaryEntry In WebFiles
            Dim ef As WebFile = DirectCast(de.Value, WebFile)
            AppendMhtFile(ef)
        Next
        AppendMhtBoundary()
    End Sub

    ''' <summary>
    ''' appends the Mht header, which includes the root HTML
    ''' </summary>
    Private Sub AppendMhtHeader(ByVal ef As WebFile)
        '-- clear the stringbuilder contents
        _MhtBuilder = New StringBuilder

        AppendMhtLine("From: <Saved by " & Environment.UserName & " on " & Environment.MachineName & ">")
        AppendMhtLine("Subject: " & ef.HtmlTitle)
        AppendMhtLine("Date: " & DateTime.Now.ToString("ddd, dd MMM yyyy HH:mm:ss zzz"))
        AppendMhtLine("MIME-Version: 1.0")
        AppendMhtLine("Content-Type: multipart/related;")
        AppendMhtLine(Convert.ToChar(9) & "type=""text/html"";")
        AppendMhtLine(Convert.ToChar(9) & "boundary=""" & _MimeBoundaryTag & """")
        AppendMhtLine("X-MimeOLE: Produced by " & Me.GetType.ToString & " " & _
            Reflection.Assembly.GetExecutingAssembly.GetName.Version.ToString())
        AppendMhtLine()
        AppendMhtLine("This is a multi-part message in MIME format.")

        AppendMhtFile(ef)
    End Sub

    ''' <summary>
    ''' append a single line, with trailing CRLF, to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtLine(Optional ByVal s As String = "")
        _MhtBuilder.Append(s)
        _MhtBuilder.Append(Environment.NewLine)
    End Sub

    ''' <summary>
    ''' appends a boundary marker to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtBoundary()
        AppendMhtLine()
        AppendMhtLine("--" & _MimeBoundaryTag)
    End Sub

    ''' <summary>
    ''' Appends a downloaded external file to our MhtBuilder
    ''' </summary>
    Private Sub AppendMhtFile(ByVal ef As WebFile)
        If ef.WasDownloaded And Not ef.WasAppended Then
            If ef.IsBinary Then
                AppendMhtBinaryFile(ef)
            Else
                AppendMhtTextFile(ef)
            End If
        End If
        ef.WasAppended = True
    End Sub

    ''' <summary>
    ''' Appends a downloaded external text file to our MhtBuilder using Quoted-Printable encoding
    ''' </summary>
    Private Sub AppendMhtTextFile(ByVal ef As WebFile)
        AppendMhtBoundary()
        AppendMhtLine("Content-Type: " & ef.ContentType & ";")
        AppendMhtLine(Convert.ToChar(9) & "charset=""" & ef.TextEncoding.WebName & """")
        AppendMhtLine("Content-Transfer-Encoding: quoted-printable")
        AppendMhtLine("Content-Location: " & ef.Url)
        AppendMhtLine()
        AppendMhtLine(QuotedPrintableEncode(ef.ToString, ef.TextEncoding))
    End Sub

    ''' <summary>
    ''' Appends a downloaded external binary file to our MhtBuilder using Base64 encoding
    ''' </summary>
    Private Sub AppendMhtBinaryFile(ByVal ef As WebFile)
        AppendMhtBoundary()
        AppendMhtLine("Content-Type: " & ef.ContentType)
        AppendMhtLine("Content-Transfer-Encoding: base64")
        AppendMhtLine("Content-Location: " & ef.Url)
        AppendMhtLine()

        '-- note that chunk size is equal to maximum line width (expanded = 75 chars)
        Const ChunkSize As Integer = 57

        If ef.Storage = FileStorage.Memory Then
            Dim len As Integer = ef.DownloadedBytes.Length
            If len <= ChunkSize Then
                AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, 0, len))
            Else
                Dim i As Integer = 0
                Do While i + ChunkSize < len
                    AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, ChunkSize))
                    i += ChunkSize
                Loop
                If i <> len Then
                    AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, len - i))
                End If
            End If
        Else
            Dim fs As IO.FileStream
            Dim b(ChunkSize) As Byte
            Dim BytesRead As Integer
            Try
                fs = New FileStream(ef.DownloadPath, FileMode.Open, FileAccess.Read)
                BytesRead = fs.Read(b, 0, ChunkSize)
                Do While BytesRead > 0
                    AppendMhtLine(Convert.ToBase64String(b, 0, BytesRead))
                    BytesRead = fs.Read(b, 0, ChunkSize)
                Loop
            Finally
                If Not fs Is Nothing Then
                    fs.Close()
                End If
            End Try
        End If
    End Sub

    ''' <summary>
    ''' dumps our MhtBuilder to disk and clears it
    ''' </summary>
    Private Sub FinalizeMht(ByVal outputFilePath As String)
        Dim sr As New StreamWriter(outputFilePath, False, _HtmlFile.TextEncoding)
        sr.Write(_MhtBuilder.ToString)
        sr.Close()
        _MhtBuilder = Nothing
    End Sub

    ''' <summary>
    ''' dumps our MhtBuilder as a string and clears it
    ''' </summary>
    Private Function FinalizeMht() As String
        Dim s As String = _MhtBuilder.ToString
        _MhtBuilder = Nothing
        Return s
    End Function

#End Region

#Region "  Quoted-Printable encoding"

    ''' <summary>
    ''' converts a string into Quoted-Printable encoding
    '''   http://www.freesoft.org/CIE/RFC/1521/6.htm
    ''' </summary>
    Private Function QuotedPrintableEncode(ByVal s As String, ByVal e As System.Text.Encoding) As String
        Dim Ascii As Integer
        Dim LastSpace As Integer = 0
        Dim LineLength As Integer = 0
        Dim LineBreaks As Integer = 0
        Dim sb As New StringBuilder
        Dim longchar As String

        If s Is Nothing OrElse s.Length = 0 Then
            Return ""
        End If

        For Each c As Char In s

            Ascii = Convert.ToInt32(c)

            If Ascii = 61 Or Ascii > 126 Then
                If Ascii <= 255 Then
                    sb.Append("=")
                    sb.Append(Convert.ToString(Ascii, 16).ToUpper)
                    LineLength += 3
                Else
                    '-- double-byte land..
                    For Each b As Byte In e.GetBytes(c)
                        sb.Append("=")
                        sb.Append(Convert.ToString(b, 16).ToUpper)
                        LineLength += 3
                    Next
                End If
            Else
                sb.Append(c)
                LineLength += 1
                If Ascii = 32 Then LastSpace = sb.Length
            End If

            If LineLength >= 73 Then
                If LastSpace = 0 Then
                    sb.Insert(sb.Length, "=" & Environment.NewLine)
                    LineLength = 0
                Else
                    sb.Insert(LastSpace, "=" & Environment.NewLine)
                    LineLength = sb.Length - LastSpace - 1
                End If
                LineBreaks += 1
                LastSpace = 0
            End If

        Next

        '-- if we split the line, have to indicate trailing spaces
        If LineBreaks > 0 Then
            If sb.Chars(sb.Length - 1) = " " Then
                sb.Remove(sb.Length - 1, 1)
                sb.Append("=20")
            End If
        End If

        Return sb.ToString
    End Function

#End Region


End Class

bytes=23544