Imports System.Text.RegularExpressions
Imports System.IO

''' <summary>
''' Represents an external file referenced in our parent HTML at the target URL.
''' Holds the downloaded bytes plus the metadata (content type, encoding,
''' content location) reported by the owning Builder's WebClient, and knows how
''' to pre-process HTML/CSS and persist itself to disk.
''' </summary>
''' <remarks>
'''   Jeff Atwood
'''   http://www.codinghorror.com/
''' </remarks>
Friend Class WebFile

    Private _Builder As Builder
    Private _Url As String
    Private _UrlUnmodified As String
    Private _UrlRoot As String
    Private _UrlFolder As String
    Private _ContentType As String
    Private _IsBinary As Boolean
    Private _TextEncoding As System.Text.Encoding
    Private _ContentLocation As String
    Private _DownloadedBytes() As Byte
    Private _DownloadException As Exception = Nothing
    Private _WasDownloaded As Boolean = False
    Private _DownloadFilename As String = ""
    Private _DownloadFolder As String = ""
    Private _DownloadExtension As String = ""
    Private _UseHtmlFilename As Boolean = False
    '-- cache of external references found by ExternalHtmlFiles(); reset when Url changes
    Private _ExternalFileCollection As Specialized.NameValueCollection

    Public Storage As Builder.FileStorage = Builder.FileStorage.DiskPermanent
    Public WasAppended As Boolean = False

    ''' <summary>
    ''' Creates a file with no URL yet; set the Url property before downloading
    ''' </summary>
    ''' <param name="parent">the Builder that owns this file and supplies the WebClient</param>
    ''' <param name="st">where the downloaded content should be stored</param>
    Public Sub New(ByVal parent As Builder, ByVal st As Builder.FileStorage)
        Me.Storage = st
        _Builder = parent
    End Sub

    ''' <summary>
    ''' Creates a file for a specific URL; an empty url leaves the Url property unset
    ''' </summary>
    ''' <param name="parent">the Builder that owns this file and supplies the WebClient</param>
    ''' <param name="url">the target URL</param>
    ''' <param name="st">where the downloaded content should be stored</param>
    Public Sub New(ByVal parent As Builder, ByVal url As String, ByVal st As Builder.FileStorage)
        _Builder = parent
        If url <> "" Then
            Me.Url = url
        End If
        Me.Storage = st
    End Sub

    ''' <summary>
    ''' The URL target for this file.
    ''' Setting it validates the URL (see ResolveUrl) and resets all download state.
    ''' </summary>
    Public Property Url() As String
        Get
            Return _Url
        End Get
        Set(ByVal Value As String)
            _UrlUnmodified = Value
            SetUrl(Value, True)
            ReDim _DownloadedBytes(0)
            _ExternalFileCollection = Nothing
            _DownloadException = Nothing
            _TextEncoding = Nothing
            _ContentType = ""
            _ContentLocation = ""
            _IsBinary = False
            _WasDownloaded = False
        End Set
    End Property

    ''' <summary>
    ''' If enabled, will use the first 50 characters of the TITLE tag
    ''' to form the filename when saved to disk
    ''' </summary>
    Public Property UseHtmlTitleAsFilename() As Boolean
        Get
            Return _UseHtmlFilename
        End Get
        Set(ByVal Value As Boolean)
            _UseHtmlFilename = Value
        End Set
    End Property

    ''' <summary>
    ''' the folder name used in the DownloadFolder
    ''' (the last path segment, ignoring any trailing backslashes)
    ''' </summary>
    Public ReadOnly Property DownloadFolderName() As String
        Get
            Return Regex.Match(Me.DownloadFolder, "(?<Folder>[^\\]+)\\*$").Groups("Folder").Value
        End Get
    End Property

    ''' <summary>
    ''' folder to download this file to
    ''' if no folder is provided, the current application folder will be used
    ''' </summary>
    Public Property DownloadFolder() As String
        Get
            If _DownloadFolder = "" Then
                _DownloadFolder = AppDomain.CurrentDomain.BaseDirectory
            End If
            Return _DownloadFolder
        End Get
        Set(ByVal Value As String)
            _DownloadFolder = Value
        End Set
    End Property

    ''' <summary>
    ''' filename to download this file as
    ''' if no filename is provided, a filename will be auto-generated based on
    ''' the URL; if the UseHtmlTitleAsFilename property is true, then the
    ''' title tag will be used to generate the filename
    ''' </summary>
    Public Property DownloadFilename() As String
        Get
            If _DownloadFilename = "" Then
                If _UseHtmlFilename AndAlso (Me.WasDownloaded AndAlso Me.IsHtml) Then
                    Dim safeTitle As String = MakeValidFilename(Me.HtmlTitle)
                    If safeTitle <> "" Then
                        _DownloadFilename = safeTitle & ".htm"
                    End If
                End If
                '-- no usable title (or title naming not requested): derive the
                '-- name from the URL so we never hand back an empty filename
                If _DownloadFilename = "" Then
                    _DownloadFilename = FilenameFromUrl()
                End If
            End If
            Return _DownloadFilename
        End Get
        Set(ByVal Value As String)
            _DownloadFilename = Value
        End Set
    End Property

    ''' <summary>
    ''' fully qualified path and filename to download this file to
    ''' </summary>
    Public Property DownloadPath() As String
        Get
            If Path.GetExtension(Me.DownloadFilename) = "" Then
                Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename & Me.DownloadExtension)
            Else
                Return Path.Combine(Me.DownloadFolder, Me.DownloadFilename)
            End If
        End Get
        Set(ByVal Value As String)
            _DownloadFilename = Path.GetFileName(Value)
            If _DownloadFilename = "" Then
                _DownloadFolder = Value
            Else
                '-- the filename is always the tail of the path; strip only that tail
                '-- (a blanket Replace would also mangle a folder with the same name)
                _DownloadFolder = Value.Substring(0, Value.Length - _DownloadFilename.Length)
            End If
        End Set
    End Property

    ''' <summary>
    ''' file type extension to use on downloaded file
    ''' this property is only used if the DownloadFilename property does not
    ''' already contain a file extension
    ''' </summary>
    Public Property DownloadExtension() As String
        Get
            If _DownloadExtension = "" Then
                If Me.WasDownloaded Then
                    _DownloadExtension = ExtensionFromContentType()
                End If
            End If
            Return _DownloadExtension
        End Get
        Set(ByVal Value As String)
            _DownloadExtension = Value
        End Set
    End Property

    ''' <summary>
    ''' If this file has external dependencies, the folder they will be stored on disk
    ''' </summary>
    Public ReadOnly Property ExternalFilesFolder() As String
        Get
            Return Path.Combine(Me.DownloadFolder, Path.GetFileNameWithoutExtension(Me.DownloadFilename)) & "_files"
        End Get
    End Property

    ''' <summary>
    ''' The unmodified "raw" URL as originally provided
    ''' </summary>
    Public ReadOnly Property UrlUnmodified() As String
        Get
            Return _UrlUnmodified
        End Get
    End Property

    ''' <summary>
    ''' The Content-Location of this URL as provided by the server,
    ''' only if the URL was not fully qualified;
    ''' eg, http://mywebsite.com/ actually maps to http://mywebsite.com/default.htm
    ''' </summary>
    Public ReadOnly Property UrlContentLocation() As String
        Get
            Return _ContentLocation
        End Get
    End Property

    ''' <summary>
    ''' The root of the URL, eg, http://mywebsite.com/
    ''' </summary>
    Public ReadOnly Property UrlRoot() As String
        Get
            Return _UrlRoot
        End Get
    End Property

    ''' <summary>
    ''' The root and folder of the URL, eg, http://mywebsite.com/myfolder
    ''' </summary>
    Public ReadOnly Property UrlFolder() As String
        Get
            Return _UrlFolder
        End Get
    End Property

    ''' <summary>
    ''' Was this file successfully downloaded via HTTP?
    ''' </summary>
    Public ReadOnly Property WasDownloaded() As Boolean
        Get
            Return _WasDownloaded
        End Get
    End Property

    ''' <summary>
    ''' The Content-Type of this file as returned by the server
    ''' </summary>
    Public ReadOnly Property ContentType() As String
        Get
            Return _ContentType
        End Get
    End Property

    ''' <summary>
    ''' Does this file contain binary data? If not, it must be text data.
    ''' </summary>
    Public ReadOnly Property IsBinary() As Boolean
        Get
            Return _IsBinary
        End Get
    End Property

    ''' <summary>
    ''' The raw bytes returned from the server for this file
    ''' </summary>
    Public ReadOnly Property DownloadedBytes() As Byte()
        Get
            Return _DownloadedBytes
        End Get
    End Property

    ''' <summary>
    ''' If not .WasDownloaded, the exception that prevented download is stored here
    ''' </summary>
    Public ReadOnly Property DownloadException() As Exception
        Get
            Return _DownloadException
        End Get
    End Property

    ''' <summary>
    ''' If this file is text (eg, it isn't binary), the type of text encoding used
    ''' </summary>
    Public ReadOnly Property TextEncoding() As System.Text.Encoding
        Get
            Return _TextEncoding
        End Get
    End Property

    ''' <summary>
    ''' Is this file HTML content?
    ''' </summary>
    Public ReadOnly Property IsHtml() As Boolean
        Get
            Return Regex.IsMatch(_ContentType, "text/html", RegexOptions.IgnoreCase)
        End Get
    End Property

    ''' <summary>
    ''' Is this file CSS content?
    ''' </summary>
    Public ReadOnly Property IsCss() As Boolean
        Get
            Return Regex.IsMatch(_ContentType, "text/css", RegexOptions.IgnoreCase)
        End Get
    End Property

    ''' <summary>
    ''' If this file is HTML, retrieve the TITLE tag from the HTML
    ''' (maximum of 50 characters)
    ''' </summary>
    ''' <exception cref="Exception">thrown when the file is not HTML</exception>
    Public ReadOnly Property HtmlTitle() As String
        Get
            If Not Me.IsHtml Then
                Throw New Exception("This file isn't HTML, so it has no HTML <TITLE> tag.")
            End If
            Const maxLength As Integer = 50
            Dim s As String = _
                Regex.Match(Me.ToString, "<title[^>]*?>(?<text>[^<]+)</title>", _
                RegexOptions.IgnoreCase Or RegexOptions.Singleline).Groups("text").Value()
            If s.Length > maxLength Then
                Return s.Substring(0, maxLength)
            Else
                Return s
            End If
        End Get
    End Property

    ''' <summary>
    ''' Returns a string representation of the data downloaded for this file;
    ''' triggers a download if one has not succeeded yet.
    ''' </summary>
    ''' <returns>
    ''' the decoded text, a "[n bytes of binary data]" placeholder for binary
    ''' content, or an empty string when nothing could be downloaded
    ''' </returns>
    Public Overrides Function ToString() As String
        If Not _WasDownloaded Then
            Download()
        End If

        '-- always return a real string; callers feed this straight into Regex
        If Not _WasDownloaded Then
            Return ""
        End If
        If _DownloadedBytes Is Nothing OrElse _DownloadedBytes.Length = 0 Then
            Return ""
        End If

        If _IsBinary Then
            Return "[" & _DownloadedBytes.Length & " bytes of binary data]"
        End If
        Return TextEncoding.GetString(_DownloadedBytes)
    End Function

    ''' <summary>
    ''' Download this file from the target URL, pre-process HTML/CSS content,
    ''' and save it to disk unless Storage is Memory.
    ''' Calling this again after a successful download only re-saves the file;
    ''' the content is not processed a second time.
    ''' </summary>
    Public Sub Download()
        '-- content that was fetched by an earlier call has already been processed;
        '-- processing twice would duplicate the web mark and re-absolutize refs
        Dim alreadyProcessed As Boolean = _WasDownloaded

        Debug.Write("Downloading " & _Url & " ..")
        DownloadBytes()
        If _DownloadException Is Nothing Then
            Debug.WriteLine("OK")
        Else
            Debug.WriteLine("failed: ", "Error")
            Debug.WriteLine(" " & _DownloadException.Message, "Error")
            Return
        End If

        If Not alreadyProcessed Then
            If Me.IsHtml Then
                _DownloadedBytes = _TextEncoding.GetBytes(ProcessHtml(Me.ToString))
            End If
            If Me.IsCss Then
                _DownloadedBytes = _TextEncoding.GetBytes(ProcessCss(Me.ToString))
            End If
        End If

        If Me.Storage <> Builder.FileStorage.Memory Then
            Me.SaveToFile()
        End If
    End Sub

    ''' <summary>
    ''' download this file from the target URL;
    ''' place the bytes downloaded in _DownloadedBytes
    ''' if an exception occurs, capture it in _DownloadException
    ''' </summary>
    Private Sub DownloadBytes()
        If Me.WasDownloaded Then Return

        '-- forget the failure of any previous attempt so a successful retry
        '-- is not reported as failed
        _DownloadException = Nothing

        '-- always download to memory first
        Try
            _DownloadedBytes = _Builder.WebClient.DownloadBytes(_Url)
            _WasDownloaded = True
        Catch ex As Net.WebException
            _DownloadException = ex
            _Builder.WebClient.ClearDownload()
        End Try

        '-- necessary if the original client URL was imprecise;
        '-- server location is always authoritative
        If _Builder.WebClient.ContentLocation <> "" Then
            _ContentLocation = _Builder.WebClient.ContentLocation
            SetUrl(_ContentLocation, False)
        End If

        _IsBinary = _Builder.WebClient.ResponseIsBinary
        _ContentType = _Builder.WebClient.ResponseContentType
        _TextEncoding = _Builder.WebClient.DetectedEncoding
        _Builder.WebClient.ClearDownload()
    End Sub

    ''' <summary>
    ''' Stores the URL and derives _UrlRoot and _UrlFolder from it
    ''' </summary>
    ''' <param name="url">the URL to store</param>
    ''' <param name="validate">when true the URL is passed through ResolveUrl first</param>
    Private Sub SetUrl(ByVal url As String, ByVal validate As Boolean)
        If validate Then
            _Url = ResolveUrl(url)
        Else
            _Url = url
        End If
        '-- http://mywebsite (or https://mywebsite)
        _UrlRoot = Regex.Match(url, "https?://[^/'""]+", RegexOptions.IgnoreCase).ToString
        '-- http://mywebsite/myfolder
        '-- (index 7 is the last slash of "https://", so anything beyond it is a path slash)
        If _Url.LastIndexOf("/") > 7 Then
            _UrlFolder = _Url.Substring(0, _Url.LastIndexOf("/"))
        Else
            _UrlFolder = _UrlRoot
        End If
    End Sub

    ''' <summary>
    ''' Pre-process the CSS using global preference settings
    ''' </summary>
    Private Function ProcessCss(ByVal css As String) As String
        Return ConvertRelativeToAbsoluteRefs(css)
    End Function

    ''' <summary>
    ''' Pre-process the HTML using global preference settings:
    ''' optionally adds the "mark of the web", strips script/iframe blocks,
    ''' honors and removes any base tag, and makes all references absolute
    ''' </summary>
    Private Function ProcessHtml(ByVal html As String) As String
        Debug.WriteLine("Downloaded content was HTML/CSS -- processing: resolving URLs, getting <base>, etc")

        If _Builder.AddWebMark Then
            '-- add "mark of the web":
            '-- http://www.microsoft.com/technet/prodtechnol/winxppro/maintain/sp2brows.mspx#XSLTsection133121120120
            html = "<!-- saved from url=(" & String.Format("{0:0000}", _Url.Length) & ")" & _Url & " -->" & _
                Environment.NewLine & html
        End If

        '-- see if we need to strip elements from the HTML
        If _Builder.StripScripts Then
            html = StripHtmlTag("script", html)
        End If
        If _Builder.StripIframes Then
            html = StripHtmlTag("iframe", html)
        End If

        '-- if we have a <base>, we must use it as the _UrlFolder,
        '-- not what was parsed from the original _Url
        Dim BaseUrlFolder As String = _
            Regex.Match(html, _
            "<base[^>]+?href=['""]{0,1}(?<BaseUrl>[^'"">]+)['""]{0,1}", _
            RegexOptions.IgnoreCase).Groups("BaseUrl").Value
        If BaseUrlFolder <> "" Then
            If BaseUrlFolder.EndsWith("/") Then
                _UrlFolder = BaseUrlFolder.Substring(0, BaseUrlFolder.Length - 1)
            Else
                _UrlFolder = BaseUrlFolder
            End If
        End If

        '-- remove the <base href=''> tag if present; causes problems when viewing locally.
        '-- (case-insensitive, to match the detection above)
        html = Regex.Replace(html, "<base[^>]*?>", "", RegexOptions.IgnoreCase)

        '-- relative URLs are a PITA for the processing we're about to do,
        '-- so convert them all to absolute up front
        Return ConvertRelativeToAbsoluteRefs(html)
    End Function

    ''' <summary>
    ''' converts all relative url references
    '''    href="myfolder/mypage.htm"
    ''' into absolute url references
    '''    href="http://mywebsite/myfolder/mypage.htm"
    ''' </summary>
    Private Function ConvertRelativeToAbsoluteRefs(ByVal html As String) As String
        Dim r As Regex

        '-- the single "/" in each pattern is deliberate: it is removed below to
        '-- turn the root-relative pattern into the folder-relative pattern
        Dim urlPattern As String = _
            "(?<attrib>\shref|\ssrc|\sbackground)\s*?=\s*?" & _
            "(?<delim1>[""'\\]{0,2})(?!\s*\+|#|https?:|ftp:|mailto:|javascript:)" & _
            "/(?<url>[^""'>\\]+)(?<delim2>[""'\\]{0,2})"

        Dim cssPattern As String = _
            "(?<attrib>@import\s|\S+-image:|background:)\s*?(url)*['""(]{1,2}" & _
            "(?!http)\s*/(?<url>[^""')]+)['"")]{1,2}"

        '-- URLs are spliced into replacement patterns, so "$" must be escaped
        Dim root As String = EscapeReplacement(_UrlRoot)
        Dim folder As String = EscapeReplacement(_UrlFolder)

        '-- href="/anything" to href="http://www.web.com/anything"
        r = New Regex(urlPattern, _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib}=${delim1}" & root & "/${url}${delim2}")

        '-- href="anything" to href="http://www.web.com/folder/anything"
        r = New Regex(urlPattern.Replace("/", ""), _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib}=${delim1}" & folder & "/${url}${delim2}")

        '-- @import(/anything) to @import url(http://www.web.com/anything)
        r = New Regex(cssPattern, _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib} url(" & root & "/${url})")

        '-- @import(anything) to @import url(http://www.web.com/folder/anything)
        r = New Regex(cssPattern.Replace("/", ""), _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        html = r.Replace(html, "${attrib} url(" & folder & "/${url})")

        Return html
    End Function

    ''' <summary>
    ''' makes a literal string safe to embed in a Regex replacement pattern
    ''' by doubling every "$"; Nothing is treated as an empty string
    ''' </summary>
    Private Shared Function EscapeReplacement(ByVal s As String) As String
        If s Is Nothing Then
            Return ""
        End If
        Return s.Replace("$", "$$")
    End Function

    ''' <summary>
    ''' returns a name/value collection of all external files referenced in HTML:
    '''
    '''     "/myfolder/blah.png"
    '''     'http://mywebsite/blah.gif'
    '''     src=blah.jpg
    '''
    ''' note that the Key includes the delimiting quotes or parens (if present), but the Value does not
    ''' this is important because the delimiters are used for matching and replacement to make the
    ''' match more specific!
    ''' </summary>
    Private Function ExternalHtmlFiles() As Specialized.NameValueCollection
        '-- avoid doing this work twice, however, be careful that the HTML hasn't
        '-- changed since the last time we called this function
        If Not _ExternalFileCollection Is Nothing Then
            Return _ExternalFileCollection
        End If

        _ExternalFileCollection = New Specialized.NameValueCollection
        Dim r As Regex
        Dim html As String = Me.ToString

        Debug.WriteLine("Resolving all external HTML references from URL:")
        Debug.WriteLine(" " & Me.Url)

        '-- src='filename.ext' ; background="filename.ext"
        '-- note that we have to test 3 times to catch all quote styles: '', "", and none
        r = New Regex( _
            "(\ssrc|\sbackground)\s*=\s*((?<Key>'(?<Value>[^']+)')|(?<Key>""(?<Value>[^""]+)"")|(?<Key>(?<Value>[^ \n\r\f]+)))", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- @import "style.css" or @import url(style.css)
        r = New Regex( _
            "(@import\s|\S+-image:|background:)\s*?(url)*\s*?(?<Key>[""'(]{1,2}(?<Value>[^""')]+)[""')]{1,2})", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- <link rel=stylesheet href="style.css">
        r = New Regex( _
            "<link[^>]+?href\s*=\s*(?<Key>('|"")*(?<Value>[^'"">]+)('|"")*)", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        '-- <iframe src="mypage.htm"> or <frame src="mypage.aspx">
        r = New Regex( _
            "<i*frame[^>]+?src\s*=\s*(?<Key>['""]{0,1}(?<Value>[^'""\\>]+)['""]{0,1})", _
            RegexOptions.IgnoreCase Or RegexOptions.Multiline)
        AddMatchesToCollection(html, r, _ExternalFileCollection)

        Return _ExternalFileCollection
    End Function

    ''' <summary>
    ''' perform the regex replacement of all &lt;tagName&gt; .. &lt;/tagName&gt; blocks
    ''' </summary>
    ''' <param name="tagName">tag to remove, eg "script"; inserted into the pattern unescaped</param>
    ''' <param name="html">the markup to strip</param>
    Private Function StripHtmlTag(ByVal tagName As String, ByVal html As String) As String
        Dim reg As Regex = New Regex( _
            String.Format("<{0}[^>]*?>[\w|\t|\r|\W]*?</{0}>", tagName), _
            RegexOptions.Multiline Or RegexOptions.IgnoreCase)
        Return reg.Replace(html, "")
    End Function

    ''' <summary>
    ''' Returns the plain text representation of the data in this file,
    ''' stripping out any HTML tags and codes
    ''' </summary>
    ''' <param name="removeWhitespace">
    ''' when true, line breaks/tabs become spaces and runs of spaces are collapsed
    ''' </param>
    Public Function ToTextString(Optional ByVal removeWhitespace As Boolean = False) As String
        Dim html As String = Me.ToString
        '-- get rid of <script> .. </script>
        html = StripHtmlTag("script", html)
        '-- get rid of <style> .. </style>
        html = StripHtmlTag("style", html)

        '-- get rid of all HTML tags
        html = Regex.Replace(html, "<\w+(\s+[A-Za-z0-9_\-]+\s*=\s*(""([^""]*)""|'([^']*)'))*\s*(/)*>|<[^>]+>", " ")
        '-- convert escaped HTML to plaintext
        html = Web.HttpUtility.HtmlDecode(html)

        If removeWhitespace Then
            '-- clean up whitespace (optional, depends what you want..)
            html = Regex.Replace(html, "[\n\r\f\t]", " ", RegexOptions.Multiline)
            html = Regex.Replace(html, " {2,}", " ", RegexOptions.Multiline)
        End If
        Return html
    End Function

    ''' <summary>
    ''' Saves this file to disk as a plain text file
    ''' (DownloadPath with its extension changed to .txt)
    ''' </summary>
    Public Sub SaveAsTextFile()
        SaveToFile(Path.ChangeExtension(Me.DownloadPath, ".txt"), True)
    End Sub

    ''' <summary>
    ''' Saves this file to disk as a plain text file, to an arbitrary path
    ''' </summary>
    Public Sub SaveAsTextFile(ByVal filePath As String)
        SaveToFile(filePath, True)
    End Sub

    ''' <summary>
    ''' writes contents of file to DownloadPath, using appropriate encoding as necessary
    ''' </summary>
    Public Sub SaveToFile()
        SaveToFile(Me.DownloadPath, False)
    End Sub

    ''' <summary>
    ''' writes contents of file to the given path, using appropriate encoding as necessary
    ''' </summary>
    Public Sub SaveToFile(ByVal filePath As String)
        SaveToFile(filePath, False)
    End Sub

    ''' <summary>
    ''' writes the contents of this file to filePath, replacing any existing file
    ''' </summary>
    ''' <param name="filePath">destination path</param>
    ''' <param name="asText">
    ''' when true and the content is not binary, the tag-stripped text from
    ''' ToTextString is written (UTF-8, no byte order mark); otherwise the
    ''' downloaded bytes are written verbatim
    ''' </param>
    Private Sub SaveToFile(ByVal filePath As String, ByVal asText As Boolean)
        Debug.WriteLine("Saving to file " & filePath)

        '-- work out what to write before touching the file system
        Dim content() As Byte
        If asText AndAlso Not Me.IsBinary Then
            '-- encode explicitly: BinaryWriter.Write(String) would prepend a
            '-- length prefix, which does not belong in a plain text file
            content = (New System.Text.UTF8Encoding(False)).GetBytes(Me.ToTextString)
        Else
            content = _DownloadedBytes
        End If

        '-- FileMode.Create truncates an existing file, so a shorter rewrite
        '-- cannot leave stale bytes from the previous contents at the end
        Dim fs As New FileStream(filePath, FileMode.Create)
        Try
            Dim bw As New BinaryWriter(fs)
            bw.Write(content)
            bw.Close()
        Finally
            If Not fs Is Nothing Then
                fs.Close()
            End If
        End Try
    End Sub

    ''' <summary>
    ''' fully resolves any relative pathing inside the URL, and other URL oddities
    ''' </summary>
    ''' <exception cref="ArgumentException">thrown when url cannot be parsed as a URI</exception>
    Private Function ResolveUrl(ByVal url As String) As String
        '-- resolve any relative pathing
        Try
            url = (New System.Uri(url)).AbsoluteUri
        Catch ex As System.UriFormatException
            Throw New ArgumentException("'" & url & "' does not appear to be a valid URL.", ex)
        End Try
        '-- remove any anchor tags from the end of URLs
        If url.IndexOf("#") > -1 Then
            Dim jump As String = Regex.Match(url, "/[^/]*?(?<jump>#[^/?.]+$)").Groups("jump").Value
            If jump <> "" Then
                '-- the pattern anchors the jump at the end, so cut it off the tail
                '-- instead of replacing every occurrence of the same text
                url = url.Substring(0, url.Length - jump.Length)
            End If
        End If
        Return url
    End Function

    ''' <summary>
    ''' if the user passed in a directory, form the filename automatically using the Html title tag
    ''' if the user passed in a filename, make sure the extension matches our desired extension
    ''' </summary>
    ''' <remarks>the html parameter is not used by the current implementation</remarks>
    Private Function DeriveFilename(ByVal FilePath As String, _
        ByVal html As String, _
        ByVal fileExtension As String) As String

        If IsDirectory(FilePath) Then
            Dim htmlTitle As String = Me.HtmlTitle
            If htmlTitle = "" Then
                Throw New Exception("No filename was provided, and the HTML title tag was not found, " & _
                    "so a filename could not be automatically generated. You'll need to provide a filename and not a folder.")
            End If
            FilePath = Path.Combine(Path.GetDirectoryName(FilePath), MakeValidFilename(htmlTitle) & fileExtension)
        Else
            If Path.GetExtension(FilePath) <> fileExtension Then
                Return Path.ChangeExtension(FilePath, fileExtension)
            End If
        End If
        Return FilePath
    End Function

    ''' <summary>
    ''' removes all unsafe filesystem characters to form a valid filesystem filename
    ''' </summary>
    ''' <param name="s">the candidate filename</param>
    ''' <param name="enforceLength">accepted for compatibility; currently has no effect</param>
    Private Function MakeValidFilename(ByVal s As String, Optional ByVal enforceLength As Boolean = False) As String
        '-- replace any invalid filesystem chars, plus leading/trailing/doublespaces
        Return Regex.Replace(Regex.Replace(s, "[\/\\\:\*\?\""""\<\>\|]|^\s+|\s+$", ""), "\s{2,}", " ")
    End Function

    ''' <summary>
    ''' returns true if this path refers to a directory (vs. a filename),
    ''' ie, it ends with a backslash
    ''' </summary>
    Private Function IsDirectory(ByVal FilePath As String) As Boolean
        Return FilePath.EndsWith("\")
    End Function

    ''' <summary>
    ''' converts all external Html files (gif, jpg, css, etc) to local refs
    ''' external ref:
    '''    &lt;img src="http://mywebsite/myfolder/myimage.gif"&gt;
    ''' into local refs:
    '''    &lt;img src="mypage_files/myimage.gif"&gt;
    ''' </summary>
    ''' <exception cref="Exception">thrown when this file is neither HTML nor CSS</exception>
    Public Sub ConvertReferencesToLocal()
        If Not Me.IsHtml And Not Me.IsCss Then
            Throw New Exception("Converting references only makes sense for HTML or CSS files; this file is of type '" & Me.ContentType & "'")
        End If

        '-- get a list of all external references
        Dim html As String = Me.ToString
        Dim FileCollection As Specialized.NameValueCollection = Me.ExternalHtmlFiles()

        '-- no external refs? nothing to do
        If FileCollection.Count = 0 Then Return

        Dim FileUrl As String

        For Each DelimitedFileUrl As String In FileCollection.AllKeys
            FileUrl = FileCollection.Item(DelimitedFileUrl)
            If _Builder.WebFiles.Contains(FileUrl) Then
                Dim wf As WebFile = DirectCast(_Builder.WebFiles.Item(FileUrl), WebFile)
                Dim NewPath As String = Me.ExternalFilesFolder & "/" & wf.DownloadFilename
                '-- keep the original delimiters, swap only the value between them;
                '-- the path is escaped because it is part of a replacement pattern
                Dim DelimitedReplacement As String = Regex.Replace(DelimitedFileUrl, _
                    "^(?<StartDelim>""|'|\()*(?<Value>[^'"")]*)(?<EndDelim>""|'|\))*$", _
                    "${StartDelim}" & EscapeReplacement(NewPath) & "${EndDelim}")
                '-- correct original Url references in Html so they point to our local files
                html = html.Replace(DelimitedFileUrl, DelimitedReplacement)
            End If
        Next

        _DownloadedBytes = _TextEncoding.GetBytes(html)
    End Sub

    ''' <summary>
    ''' appends key=value named matches in a regular expression
    ''' to a target NameValueCollection; values that are not fully
    ''' qualified http(s) URLs are discarded
    ''' </summary>
    ''' <param name="s">the text to search</param>
    ''' <param name="r">a regex exposing named groups "Key" and "Value"</param>
    ''' <param name="nvc">the collection that receives new matches</param>
    Private Sub AddMatchesToCollection(ByVal s As String, _
        ByVal r As Regex, _
        ByRef nvc As Specialized.NameValueCollection)

        Dim key As String
        Dim value As String
        Dim headerDisplayed As Boolean = False
        Dim urlRegex As New Regex("^https*://\w+", RegexOptions.IgnoreCase)

        For Each m As Match In r.Matches(s)
            If Not headerDisplayed Then
                Debug.WriteLine("Matches added from regex:")
                Debug.WriteLine("""" & r.ToString & """")
                headerDisplayed = True
            End If
            key = m.Groups("Key").ToString
            value = m.Groups("Value").ToString
            If nvc.Item(key) Is Nothing Then
                Debug.WriteLine(" Match: " & m.ToString)
                Debug.WriteLine(" Key: " & key)
                Debug.WriteLine(" Value: " & value)
                If Not urlRegex.IsMatch(value) Then
                    Debug.WriteLine("Match discarded; does not appear to be valid fully qualified http:// Url", "Error")
                Else
                    nvc.Add(key, value)
                End If
            End If
        Next
    End Sub

    ''' <summary>
    ''' download ALL externally referenced files in this file's html, potentially recursively,
    ''' to the default download path for this page
    ''' </summary>
    Public Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, Optional ByVal recursive As Boolean = False)
        DownloadExternalFiles(st, Me.ExternalFilesFolder, recursive)
    End Sub

    ''' <summary>
    ''' download ALL externally referenced files in this html, potentially recursively
    ''' to a specific download path
    ''' </summary>
    Private Sub DownloadExternalFiles(ByVal st As Builder.FileStorage, ByVal targetFolder As String, ByVal recursive As Boolean)
        Dim FileCollection As Specialized.NameValueCollection = ExternalHtmlFiles()
        If Not FileCollection.HasKeys Then Return

        Debug.WriteLine("Downloading all external files collected from URL:")
        Debug.WriteLine(" " & Url)
        For Each Key As String In FileCollection.AllKeys
            DownloadExternalFile(FileCollection.Item(Key), st, targetFolder, recursive)
        Next
    End Sub

    ''' <summary>
    ''' Download a single externally referenced file (if we haven't already downloaded it)
    ''' </summary>
    ''' <param name="url">fully qualified URL of the referenced file</param>
    ''' <param name="st">where the downloaded content should be stored</param>
    ''' <param name="targetFolder">folder used when storing on disk; created if missing</param>
    ''' <param name="recursive">also download the dependencies of HTML/CSS files</param>
    Private Sub DownloadExternalFile(ByVal url As String, ByVal st As Builder.FileStorage, _
        ByVal targetFolder As String, Optional ByVal recursive As Boolean = False)

        Dim wf As WebFile
        Dim isNew As Boolean

        '-- have we already downloaded (or attempted to) this file?
        If _Builder.WebFiles.Contains(url) OrElse _Builder.Url = url Then
            wf = DirectCast(_Builder.WebFiles.Item(url), WebFile)
            isNew = False
            '-- the URL matched the builder's own Url without being in the shared
            '-- collection: there is no WebFile to work with, so nothing to do
            If wf Is Nothing Then Return
        Else
            wf = New WebFile(_Builder, url, st)
            isNew = True
        End If

        '-- if we're planning to store this file on disk, make sure we can
        If st = Builder.FileStorage.DiskPermanent OrElse st = Builder.FileStorage.DiskTemporary Then
            If Not Directory.Exists(targetFolder) Then
                Directory.CreateDirectory(targetFolder)
            End If
            wf.DownloadFolder = targetFolder
        End If

        wf.Download()

        If isNew Then
            '-- add this (possibly) downloaded file to our shared collection
            _Builder.WebFiles.Add(wf.UrlUnmodified, wf)

            '-- if this is an HTML file, it has dependencies of its own;
            '-- download them into a subfolder
            If recursive AndAlso (wf.IsHtml OrElse wf.IsCss) Then
                wf.DownloadExternalFiles(st, recursive)
            End If
        End If
    End Sub

    ''' <summary>
    ''' attempt to get a coherent filename out of the Url
    ''' </summary>
    Private Function FilenameFromUrl() As String
        '-- first, try to get a filename out of the URL itself;
        '-- this means anything past the final slash that doesn't include another slash
        '-- or a question mark, eg http://mywebsite/myfolder/crazy?param=1&param=2
        Dim filename As String = Regex.Match(_Url, "/(?<Filename>[^/?]+)[^/]*$").Groups("Filename").Value
        If filename <> "" Then
            '-- that worked, but we need to make sure the filename is unique
            '-- if query params were passed to the URL file
            Dim u As New Uri(_Url)
            If u.Query <> "" Then
                filename = Path.GetFileNameWithoutExtension(filename) & "_" & u.Query.GetHashCode.ToString & Me.DownloadExtension
            End If
        End If
        '-- ok, that didn't work; if this file is HTML try to get the TITLE tag
        If filename = "" Then
            If Me.IsHtml Then
                filename = Me.HtmlTitle
                If filename <> "" Then
                    filename &= ".htm"
                End If
            End If
        End If
        '-- now we're really desperate. Hash the URL and make that the filename.
        If filename = "" Then
            filename = _Url.GetHashCode.ToString & Me.DownloadExtension
        End If
        Return MakeValidFilename(filename)
    End Function

    ''' <summary>
    ''' if we weren't given a filename extension, infer it from the download
    ''' Content-Type header; unknown types fall back to ".htm"
    ''' </summary>
    ''' <remarks>
    ''' http://www.utoronto.ca/webdocs/HTMLdocs/Book/Book-3ed/appb/mimetype.html
    ''' </remarks>
    Private Function ExtensionFromContentType() As String
        '-- only the media type counts; drop any "; charset=..." parameters
        Select Case Regex.Match(Me.ContentType, "^[^ ;]+").Value.ToLower
            Case "text/html"
                Return ".htm"
            Case "image/gif"
                Return ".gif"
            Case "image/jpeg"
                Return ".jpg"
            Case "text/javascript", "application/x-javascript"
                Return ".js"
            Case "image/png", "image/x-png"
                Return ".png"
            Case "text/css"
                Return ".css"
            Case "text/plain"
                Return ".txt"
            Case Else
                Debug.WriteLine("Unknown content-type '" & Me.ContentType & "'", "Error")
                Return ".htm"
        End Select
    End Function

End Class
bytes=34010