Tweak to cope with different DB file names
[debian-cd-search.git] / split_web
#!/bin/sh
#
# Split the Debian CD index page into header/footer HTML fragments.
#
# For every language code listed in LANGUAGES this script downloads
#   https://www.debian.org/CD/index.<lang>.html
# and writes four files into the current directory:
#   header1.<lang>.html  <div id="header"> line up to, not including,
#                        the <p id="breadcrumbs"> line
#   header2.<lang>.html  "</div> <!-- end header -->" line up to, not
#                        including, the <div id="maincol"> line
#   footer1.<lang>.html  "</div> <!-- end maincol -->" line up to, not
#                        including, the <div id="pageLang"> line
#   footer2.<lang>.html  line mentioning /MailingLists/disclaimer through
#                        the "</div> <!-- end footermap -->" line
# Relative links inside the fragments are rewritten to absolute
# https://www.debian.org/ URLs.
#
# Exit status: 0 if every language was processed, 1 if any download failed
# (fragments from an earlier run are left untouched in that case).

LANGUAGES=en

status=0

# Download to a temporary file first so that a failed or empty download
# never wipes out the fragments generated by a previous run.
page=$(mktemp) || exit 1
trap 'rm -f -- "$page"' EXIT

# The loop variable is deliberately NOT called LANG: that name is the locale
# environment variable, and overwriting it would hand an invalid locale to
# wget and awk whenever LANG is exported.
# LANGUAGES is intentionally unquoted: it is a space-separated list.
for lang in ${LANGUAGES}; do
    url="https://www.debian.org/CD/index.${lang}.html"

    if ! wget -q -O "$page" "$url" || [ ! -s "$page" ]; then
        printf '%s: failed to download %s\n' "${0##*/}" "$url" >&2
        status=1
        continue
    fi

    # awk appends (>>) to the fragments, so remove stale copies first.
    rm -f -- "header1.${lang}.html" "header2.${lang}.html" \
             "footer1.${lang}.html" "footer2.${lang}.html"

    awk -v lang="$lang" '

        # Marker lines switch the current output fragment; an empty state
        # means the line belongs to no fragment and is dropped.
        /^<div id="header">/               {state="header1"}
        /^<p id="breadcrumbs">/            {state=""}

        /^<\/div> <!-- end header -->/     {state="header2"}
        /^<div id="maincol">/              {state=""}

        /^<\/div> <!-- end maincol -->/    {state="footer1"}
        /^<div id="pageLang">/             {state=""}

        /\/MailingLists\/disclaimer/       {state="footer2"}
        # The closing line of the footer map is written as-is (before any
        # link rewriting) and ends the fragment.
        /^<\/div> <!-- end footermap -->/  {print $0 >> "footer2." lang ".html"; state=""}

        {
            # Turn site-relative links into absolute ones.
            gsub("\"\\.\\./", "\"https://www.debian.org/")
            gsub("\"/\"", "\"https://www.debian.org/\"")
            gsub("/MailingLists/", "https://www.debian.org/MailingLists/")
            # The MailingLists rewrite is not anchored, so a link that was
            # already made absolute above ("../MailingLists/...") ends up
            # with the host twice; collapse that back to a single prefix.
            gsub("https://www\\.debian\\.orghttps://www\\.debian\\.org/", "https://www.debian.org/")

            if (state != "") { print $0 >> state "." lang ".html" }
        }' "$page"
done

exit "$status"