From b82c819afb5271de2d861696cc8c2a08d3487f85 Mon Sep 17 00:00:00 2001 From: tomsellers Date: Tue, 10 Jul 2012 00:23:02 +0000 Subject: [PATCH] Update to add additional blacklist entries to the httpspider library. The goal is to avoid downloading and processing certain additional video, audio and binary formats. This should speed up crawling certain sites. In the case of http-email-harvest it should reduce some of the false positives generated by running the RegEx against binary data. The only script whose results this appears likely to have affected is http-sitemap-generator, and that script specifically disables the blacklist. --- nselib/httpspider.lua | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/nselib/httpspider.lua b/nselib/httpspider.lua index c240fffc7..fc010f9c8 100644 --- a/nselib/httpspider.lua +++ b/nselib/httpspider.lua @@ -596,11 +596,13 @@ Crawler = { addDefaultBlacklist = function(self) local extensions = { image_extensions = {"png","jpg","jpeg","gif","bmp"}, + video_extensions = {"avi","flv","ogg","mp4","wmv"}, + audio_extensions = {"aac","m4a","mp3","wav"}, doc_extensions = {"pdf", "doc", "docx", "docm", "xls", "xlsx", "xlsm", "ppt", "pptx", "pptm", "odf", "ods", "odp", "ps", "xps"}, archive_extensions = {"zip", "tar.gz", "gz", "rar", "7z", "sit", "sitx", "tgz", "tar.bz", "tar", "iso"}, - exe_extensions = {"exe", "com", "msi", "bin","dmg"} + exe_extensions = {"exe", "com", "msi", "bin","dmg"} } local blacklist = {} for _, cat in pairs(extensions) do