Add scraper to grab bits of text and style from www.d.o/CD/
author Steve McIntyre <steve@einval.com>
Thu, 14 Apr 2011 20:44:48 +0000 (21:44 +0100)
committer Steve McIntyre <steve@einval.com>
Thu, 14 Apr 2011 20:44:48 +0000 (21:44 +0100)
Grab http://www.debian.org/CD/index.$LANG.html, split up and
search/replace some of the text to make it useful for the search CGI.

split_web [new file with mode: 0755]

diff --git a/split_web b/split_web
new file mode 100755 (executable)
index 0000000..db4879f
--- /dev/null
+++ b/split_web
@@ -0,0 +1,31 @@
+#!/bin/sh
+
+LANGUAGES=en
+
+for LANG in ${LANGUAGES}; do
+    rm -f *er?.${LANG}.html
+    wget -q -O- http://www.debian.org/CD/index.${LANG}.html | awk -v LANG=$LANG '
+
+        /^<div id="header">/               {state="header1"}
+        /^<p id="breadcrumbs">/            {state=""}
+
+        /^<\/div> <!-- end header -->/     {state="header2"}
+        /^<div id="maincol">/              {state=""}
+
+        /^<\/div> <!-- end maincol -->/    {state="footer1"}
+        /^<div id="pageLang">/             {state=""}
+
+        /\/MailingLists\/disclaimer/       {state="footer2"}
+        /^<\/div> <!-- end footermap -->/  {print $0 >> "footer2."LANG".html";state=""}
+
+/.*/ {
+         gsub("\"\\.\\./","\"http://www.debian.org/")
+         gsub("\"/\"","\"http://www.debian.org/\"")
+         gsub("/MailingLists/","http://www.debian.org/MailingLists/")
+         if (state == "header1") { print $0 >> "header1."LANG".html" }
+         if (state == "header2") { print $0 >> "header2."LANG".html" }
+         if (state == "footer1") { print $0 >> "footer1."LANG".html" }
+         if (state == "footer2") { print $0 >> "footer2."LANG".html" }
+     }'
+
+done
\ No newline at end of file