Imports System.Text Imports System.Text.RegularExpressions Imports System.IO ''' <summary> ''' This class builds the following from a URL: ''' ''' .mht file (Web Archive, single file) ''' .htm file with dereferenced (absolute) references (Web Page, HTML Only) ''' .htm file plus all referenced files in a local subfolder (Web Page, complete) ''' .txt file (non-HTML contents of Web Page) ''' ''' The .mht format is based on RFC2557 ''' "compliant Multipart MIME Message (mhtml web archive)" ''' http://www.ietf.org/rfc/rfc2557.txt ''' </summary> ''' <remarks> ''' Jeff Atwood ''' http://www.codinghorror.com/ ''' </remarks> Public Class Builder Private _MhtBuilder As StringBuilder Private _StripScriptFromHtml As Boolean = False Private _StripIframeFromHtml As Boolean = False Private _AllowRecursion As Boolean = True Private _AddWebMark As Boolean = True Private _ForcedEncoding As System.Text.Encoding = Nothing Private _HtmlFile As WebFile Friend WebFiles As New SortedList Friend WebClient As New WebClientEx Private Const _MimeBoundaryTag As String = "----=_NextPart_000_00" Public Enum FileStorage Memory DiskPermanent DiskTemporary End Enum Public Sub New() _HtmlFile = New WebFile(Me, FileStorage.Memory) End Sub #Region " Properties" ''' <summary> ''' Specifies the target Url we want to save ''' </summary> Public Property Url() As String Get Return _HtmlFile.Url End Get Set(ByVal Value As String) WebFiles.Clear() _HtmlFile.Url = Value End Set End Property ''' <summary> ''' returns the Mime content-type string designation of a mht file ''' </summary> Public ReadOnly Property MhtContentType() As String Get Return "message/rfc822" End Get End Property ''' <summary> ''' *only* set this if you want to FORCE a specific text encoding for all the HTML pages you're downloading; ''' otherwise the text encoding is autodetected, which is generally what you want ''' </summary> Public Property TextEncoding() As System.Text.Encoding Get Return _ForcedEncoding End Get Set(ByVal Value As System.Text.Encoding) _ForcedEncoding = Value End Set End Property ''' <summary> ''' Add the "Mark of the web" to retrieved HTML content so it can run ''' locally on Windows XP SP2 ''' </summary> ''' <remarks> ''' http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120 ''' </remarks> Public Property AddWebMark() As Boolean Get Return _AddWebMark End Get Set(ByVal Value As Boolean) _AddWebMark = Value End Set End Property ''' <summary> ''' Strip all <SCRIPT> blocks from any retrieved HTML ''' </summary> Public Property StripScripts() As Boolean Get Return _StripScriptFromHtml End Get Set(ByVal Value As Boolean) _StripScriptFromHtml = Value End Set End Property ''' <summary> ''' Strip all <IFRAME> blocks from any retrieved HTML ''' </summary> Public Property StripIframes() As Boolean Get Return _StripIframeFromHtml End Get Set(ByVal Value As Boolean) _StripIframeFromHtml = Value End Set End Property ''' <summary> ''' The browser identification string that is sent in all HTTP requests; ''' using a different string can produce simplified (downlevel) HTML ''' </summary> ''' <remarks> ''' defaults to browser ID string of vanilla IE6 as seen in XP SP2 ''' </remarks> Public Property BrowserIdString() As String Get Return WebClient.BrowserIdString End Get Set(ByVal Value As String) WebClient.BrowserIdString = Value End Set End Property ''' <summary> ''' the target URL requires authentication ''' if not provided, the current user's credentials will automatically be sent ''' </summary> Public Property AuthenticationRequired() As Boolean Get Return WebClient.AuthenticationRequired End Get Set(ByVal Value As Boolean) WebClient.AuthenticationRequired = Value End Set End Property ''' <summary> ''' HTTP Authentication user for Url ''' if left blank, the current user's credentials will be sent ''' </summary> Public Property AuthenticationUser() As String Get Return WebClient.AuthenticationUser End Get Set(ByVal Value As String) WebClient.AuthenticationUser = Value End Set End Property ''' <summary> ''' HTTP Authentication password for Url ''' if left blank, the current user's credentials will be sent ''' </summary> Public Property AuthenticationPassword() As String Get Return WebClient.AuthenticationPassword End Get Set(ByVal Value As String) WebClient.AuthenticationPassword = Value End Set End Property ''' <summary> ''' HTTP proxy username ''' if left blank, the current user's credentials will be sent ''' </summary> Public Property ProxyUser() As String Get Return WebClient.ProxyUser End Get Set(ByVal Value As String) WebClient.ProxyUser = Value End Set End Property ''' <summary> ''' HTTP proxy password ''' if left blank, the current user's credentials will be sent ''' </summary> Public Property ProxyPassword() As String Get Return WebClient.ProxyPassword End Get Set(ByVal Value As String) WebClient.ProxyPassword = Value End Set End Property ''' <summary> ''' HTTP proxy URL ''' if provided, proxy will always be used; if left blank, proxy will not be used ''' </summary> Public Property ProxyUrl() As String Get Return WebClient.ProxyUrl End Get Set(ByVal Value As String) WebClient.ProxyUrl = Value End Set End Property ''' <summary> ''' Proxy requires authentication ''' if not provided, the current user's credentials will automatically be sent ''' </summary> Public Property ProxyAuthenticationRequired() As Boolean Get Return WebClient.ProxyAuthenticationRequired End Get Set(ByVal Value As Boolean) WebClient.ProxyAuthenticationRequired = Value End Set End Property ''' <summary> ''' allow recursive retrieval of any embedded HTML (typically IFRAME or FRAME) ''' turn off to prevent infinite recursion in the case of pages that reference themselves.. ''' </summary> Public Property AllowRecursiveFileRetrieval() As Boolean Get Return _AllowRecursion End Get Set(ByVal Value As Boolean) _AllowRecursion = Value End Set End Property #End Region #Region " Public" ''' <summary> ''' Saves URL to disk as a single HTML file, modified with absolute external references ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file ''' </summary> ''' <param name="outputFilePath">path to generate to, or filename to generate</param> ''' <param name="url">fully qualified URL you wish to save</param> ''' <returns>the complete path of the HTML file that was saved to disk</returns> Public Function SavePage(ByVal outputFilePath As String, Optional ByVal url As String = "") As String ValidateFilename(outputFilePath, ".htm;.html") DownloadHtmlFile(url) _HtmlFile.UseHtmlTitleAsFilename = True _HtmlFile.DownloadPath = outputFilePath _HtmlFile.SaveToFile() Return _HtmlFile.DownloadPath End Function ''' <summary> ''' Saves URL to disk as a plain text file, stripping all HTML from it ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file ''' </summary> ''' <param name="outputFilePath">path to generate to, or filename to generate</param> ''' <param name="url">fully qualified URL you wish to save</param> ''' <returns>the complete path of the text file that was saved to disk</returns> Public Function SavePageText(ByVal outputFilePath As String, Optional ByVal url As String = "") As String ValidateFilename(outputFilePath, ".txt") DownloadHtmlFile(url) _HtmlFile.UseHtmlTitleAsFilename = True _HtmlFile.DownloadPath = outputFilePath _HtmlFile.SaveAsTextFile() Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".txt") End Function ''' <summary> ''' Saves URL to disk as multiple files: a single HTML file, modified with local references ''' to externally referenced files in a subfolder ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file ''' </summary> ''' <param name="outputFilePath">path to generate to, or filename to generate</param> ''' <param name="url">fully qualified URL you wish to save</param> ''' <returns>the complete path of the HTML file that was saved to disk</returns> Public Function SavePageComplete(ByVal outputFilePath As String, Optional ByVal url As String = "") As String ValidateFilename(outputFilePath, ".htm;.html") DownloadHtmlFile(url) '-- first, let's get all the external files _HtmlFile.DownloadPath = outputFilePath _HtmlFile.UseHtmlTitleAsFilename = True _HtmlFile.DownloadExternalFiles(FileStorage.DiskPermanent, _AllowRecursion) '-- convert any references in external files For Each de As DictionaryEntry In WebFiles Dim ef As WebFile = DirectCast(de.Value, WebFile) If ef.IsHtml Or ef.IsCss Then ef.ConvertReferencesToLocal() ef.SaveToFile() End If Next '-- convert the main HTML references _HtmlFile.ConvertReferencesToLocal() _HtmlFile.SaveToFile() Return _HtmlFile.DownloadPath End Function ''' <summary> ''' Generates a string representation of the URL as a Mht archive file ''' using exclusively in-memory storage ''' </summary> ''' <param name="url">fully qualified URL you wish to render to Mht</param> ''' <returns>string representation of MHT file</returns> Public Function GetPageArchive(Optional ByVal url As String = "") As String DownloadHtmlFile(url) _HtmlFile.DownloadExternalFiles(FileStorage.Memory, _AllowRecursion) AppendMhtHeader(_HtmlFile) AppendMhtFiles() Return FinalizeMht() End Function ''' <summary> ''' Saves URL to disk as a single file Mht archive ''' if a folder is provided instead of a filename, the TITLE tag is used to name the file ''' </summary> ''' <param name="outputFilePath">path to generate to, or filename to generate</param> ''' <param name="st">type of storage to use when generating the Mht archive</param> ''' <param name="url">fully qualified URL you wish to save as Mht</param> ''' <returns>the complete path of the Mht archive file that was generated</returns> Public Function SavePageArchive(ByVal outputFilePath As String, ByVal st As FileStorage, Optional ByVal url As String = "") As String ValidateFilename(outputFilePath, ".mht") DownloadHtmlFile(url) _HtmlFile.DownloadPath = outputFilePath _HtmlFile.UseHtmlTitleAsFilename = True '-- if set to permanent disk storage, make a local copy of the HTML If st = FileStorage.DiskPermanent Then _HtmlFile.SaveToFile(Path.ChangeExtension(_HtmlFile.DownloadPath, ".htm")) End If '-- download all references _HtmlFile.DownloadExternalFiles(st, _AllowRecursion) '-- build the Mht AppendMhtHeader(_HtmlFile) AppendMhtFiles() FinalizeMht(Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht")) '-- possibly destroy temporary resources If st = FileStorage.DiskTemporary Then For Each de As DictionaryEntry In WebFiles Dim ef As WebFile = DirectCast(de.Value, WebFile) If ef.Storage = FileStorage.DiskTemporary Then File.Delete(ef.DownloadPath) End If '-- if the temp folder is empty, kill that too If Directory.GetFileSystemEntries(ef.DownloadFolder).Length = 0 Then Directory.Delete(ef.DownloadFolder) End If Next End If WebFiles.Clear() Return Path.ChangeExtension(_HtmlFile.DownloadPath, ".mht") End Function #End Region #Region " Private" ''' <summary> ''' returns the root HTML we'll use to generate everything else; ''' this is tracked in the _HtmlFile object, which is always FileStorage.Memory ''' </summary> Private Sub DownloadHtmlFile(ByVal url As String) If url <> "" Then Me.Url = url End If _HtmlFile.Storage = FileStorage.Memory _HtmlFile.WasAppended = False _HtmlFile.Download() If Not _HtmlFile.WasDownloaded Then Throw New Exception("unable to download '" & Me.Url & "': " & _ _HtmlFile.DownloadException.Message, _HtmlFile.DownloadException) End If End Sub ''' <summary> ''' returns true if this path refers to a directory (vs. a filename) ''' </summary> Private Function IsDirectory(ByVal FilePath As String) As Boolean Return FilePath.EndsWith("\") End Function ''' <summary> ''' ensures that the path, if it contains a filename, matches one of the semicolon delimited ''' filetypes provided in fileExtension ''' </summary> Private Sub ValidateFilename(ByVal FilePath As String, ByVal fileExtensions As String) If IsDirectory(FilePath) Then Return Dim ext As String = Path.GetExtension(FilePath) If ext = "" Then Throw New Exception("The filename provided, '" & Path.GetFileName(FilePath) & _ "', has no extension. If are specifying a folder, make sure it ends in a trailing slash. " & _ "The expected file extension(s) are '" & fileExtensions & "'") End If If Not Regex.IsMatch(fileExtensions, ext & "(;|$)", RegexOptions.IgnoreCase) Then Throw New Exception("The extension of the filename provided, '" & Path.GetFileName(FilePath) & _ "', does not have the expected extension(s) '" & fileExtensions & "'") End If End Sub ''' <summary> ''' removes all unsafe filesystem characters to form a valid filesystem filename ''' </summary> Private Function MakeValidFilename(ByVal s As String) As String '-- replace any invalid filesystem chars with underscore Return Regex.Replace(s, "[\/\\\:\*\?\""\<\>\|]", "_") End Function ''' <summary> ''' appends all downloaded files (from _ExternalFiles) to our MhtBuilder ''' </summary> ''' <param name="st">type of storage to use when downloading external files</param> ''' <param name="storagePath">path to use for downloaded external files</param> Private Sub AppendMhtFiles() For Each de As DictionaryEntry In WebFiles Dim ef As WebFile = DirectCast(de.Value, WebFile) AppendMhtFile(ef) Next AppendMhtBoundary() End Sub ''' <summary> ''' appends the Mht header, which includes the root HTML ''' </summary> Private Sub AppendMhtHeader(ByVal ef As WebFile) '-- clear the stringbuilder contents _MhtBuilder = New StringBuilder AppendMhtLine("From: <Saved by " & Environment.UserName & " on " & Environment.MachineName & ">") AppendMhtLine("Subject: " & ef.HtmlTitle) AppendMhtLine("Date: " & DateTime.Now.ToString("ddd, dd MMM yyyy HH:mm:ss zzz")) AppendMhtLine("MIME-Version: 1.0") AppendMhtLine("Content-Type: multipart/related;") AppendMhtLine(Convert.ToChar(9) & "type=""text/html"";") AppendMhtLine(Convert.ToChar(9) & "boundary=""" & _MimeBoundaryTag & """") AppendMhtLine("X-MimeOLE: Produced by " & Me.GetType.ToString & " " & _ Reflection.Assembly.GetExecutingAssembly.GetName.Version.ToString()) AppendMhtLine() AppendMhtLine("This is a multi-part message in MIME format.") AppendMhtFile(ef) End Sub ''' <summary> ''' append a single line, with trailing CRLF, to our MhtBuilder ''' </summary> Private Sub AppendMhtLine(Optional ByVal s As String = "") _MhtBuilder.Append(s) _MhtBuilder.Append(Environment.NewLine) End Sub ''' <summary> ''' appends a boundary marker to our MhtBuilder ''' </summary> Private Sub AppendMhtBoundary() AppendMhtLine() AppendMhtLine("--" & _MimeBoundaryTag) End Sub ''' <summary> ''' Appends a downloaded external file to our MhtBuilder ''' </summary> Private Sub AppendMhtFile(ByVal ef As WebFile) If ef.WasDownloaded And Not ef.WasAppended Then If ef.IsBinary Then AppendMhtBinaryFile(ef) Else AppendMhtTextFile(ef) End If End If ef.WasAppended = True End Sub ''' <summary> ''' Appends a downloaded external text file to our MhtBuilder using Quoted-Printable encoding ''' </summary> Private Sub AppendMhtTextFile(ByVal ef As WebFile) AppendMhtBoundary() AppendMhtLine("Content-Type: " & ef.ContentType & ";") AppendMhtLine(Convert.ToChar(9) & "charset=""" & ef.TextEncoding.WebName & """") AppendMhtLine("Content-Transfer-Encoding: quoted-printable") AppendMhtLine("Content-Location: " & ef.Url) AppendMhtLine() AppendMhtLine(QuotedPrintableEncode(ef.ToString, ef.TextEncoding)) End Sub ''' <summary> ''' Appends a downloaded external binary file to our MhtBuilder using Base64 encoding ''' </summary> Private Sub AppendMhtBinaryFile(ByVal ef As WebFile) AppendMhtBoundary() AppendMhtLine("Content-Type: " & ef.ContentType) AppendMhtLine("Content-Transfer-Encoding: base64") AppendMhtLine("Content-Location: " & ef.Url) AppendMhtLine() '-- note that chunk size is equal to maximum line width (expanded = 75 chars) Const ChunkSize As Integer = 57 If ef.Storage = FileStorage.Memory Then Dim len As Integer = ef.DownloadedBytes.Length If len <= ChunkSize Then AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, 0, len)) Else Dim i As Integer = 0 Do While i + ChunkSize < len AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, ChunkSize)) i += ChunkSize Loop If i <> len Then AppendMhtLine(Convert.ToBase64String(ef.DownloadedBytes, i, len - i)) End If End If Else Dim fs As IO.FileStream Dim b(ChunkSize) As Byte Dim BytesRead As Integer Try fs = New FileStream(ef.DownloadPath, FileMode.Open, FileAccess.Read) BytesRead = fs.Read(b, 0, ChunkSize) Do While BytesRead > 0 AppendMhtLine(Convert.ToBase64String(b, 0, BytesRead)) BytesRead = fs.Read(b, 0, ChunkSize) Loop Finally If Not fs Is Nothing Then fs.Close() End If End Try End If End Sub ''' <summary> ''' dumps our MhtBuilder to disk and clears it ''' </summary> Private Sub FinalizeMht(ByVal outputFilePath As String) Dim sr As New StreamWriter(outputFilePath, False, _HtmlFile.TextEncoding) sr.Write(_MhtBuilder.ToString) sr.Close() _MhtBuilder = Nothing End Sub ''' <summary> ''' dumps our MhtBuilder as a string and clears it ''' </summary> Private Function FinalizeMht() As String Dim s As String = _MhtBuilder.ToString _MhtBuilder = Nothing Return s End Function #End Region #Region " Quoted-Printable encoding" ''' <summary> ''' converts a string into Quoted-Printable encoding ''' http://www.freesoft.org/CIE/RFC/1521/6.htm ''' </summary> Private Function QuotedPrintableEncode(ByVal s As String, ByVal e As System.Text.Encoding) As String Dim Ascii As Integer Dim LastSpace As Integer = 0 Dim LineLength As Integer = 0 Dim LineBreaks As Integer = 0 Dim sb As New StringBuilder Dim longchar As String If s Is Nothing OrElse s.Length = 0 Then Return "" End If For Each c As Char In s Ascii = Convert.ToInt32(c) If Ascii = 61 Or Ascii > 126 Then If Ascii <= 255 Then sb.Append("=") sb.Append(Convert.ToString(Ascii, 16).ToUpper) LineLength += 3 Else '-- double-byte land.. For Each b As Byte In e.GetBytes(c) sb.Append("=") sb.Append(Convert.ToString(b, 16).ToUpper) LineLength += 3 Next End If Else sb.Append(c) LineLength += 1 If Ascii = 32 Then LastSpace = sb.Length End If If LineLength >= 73 Then If LastSpace = 0 Then sb.Insert(sb.Length, "=" & Environment.NewLine) LineLength = 0 Else sb.Insert(LastSpace, "=" & Environment.NewLine) LineLength = sb.Length - LastSpace - 1 End If LineBreaks += 1 LastSpace = 0 End If Next '-- if we split the line, have to indicate trailing spaces If LineBreaks > 0 Then If sb.Chars(sb.Length - 1) = " " Then sb.Remove(sb.Length - 1, 1) sb.Append("=20") End If End If Return sb.ToString End Function #End Region End Class
bytes=23544