3 # |_) /| Copyright (C) 2000 | richard@
4 # | \/¯| Richard Atterer | atterer.org
6 # This program is free software; you can redistribute it and/or modify
7 # it under the terms of the GNU General Public License, version 2. See
8 # the file COPYING for details.
# appendWord(word, spaceAfterWord) -- add one word to the output document.
#   word           - text to append; may be empty
#   spaceAfterWord - separator that followed the word in the input; it is
#                    remembered in prevSpaceAfterWord and written in front
#                    of the *next* word, not after this one
# Works through globals: reads ind, indentStr, curMaxLen and doPreserve;
# updates doc (finished lines), docLine (line currently being built) and
# prevSpaceAfterWord.
# NOTE(review): the embedded line numbers skip (14->17, 25->28, 29->32), so
# some statements and the closing braces are not visible in this listing;
# the comments below describe only the lines that are shown.
10 function appendWord(word, spaceAfterWord) {
11 #print "appendWord \"" word "\" \"" gensub(/\n/, "\\\\n", "g", spaceAfterWord) "\"";
# The previous word was followed by a newline: finish the current line now,
# prefixed with the first `ind` characters of indentStr.
12 if (prevSpaceAfterWord == "\n") {
13 # Linebreak while inside <pre>
14 doc = doc substr(indentStr, 1, ind) docLine "\n";
17 prevSpaceAfterWord = spaceAfterWord;
# Word fits: indentation + current line + word stays below curMaxLen (the
# separator is not counted). Empty words are skipped unless doPreserve > 0,
# in which case the pending separator is still appended.
21 if (ind + length(docLine) + length(word) < curMaxLen) {
23 if (word != "" || doPreserve > 0)
24 docLine = docLine prevSpaceAfterWord word;
25 prevSpaceAfterWord = spaceAfterWord;
# Presumably the "does not fit" branch: flush a non-empty current line to
# doc with its indentation, then remember the new separator.
28 if (docLine != "") doc = doc substr(indentStr, 1, ind) docLine "\n";
29 #print ">>> " docLine;
32 prevSpaceAfterWord = spaceAfterWord;
35 #______________________________________________________________________
# Formatting configuration.
# Output lines are prefixed with substr(indentStr, 1, ind), so the length of
# indentStr caps how deep the indentation can get.
42 indentStr = " "; # Won't indent by more than this
43 killClass = 1; # If nonzero, remove all " class=...>" attributes
# Tags that the main loop treats as structural; any tag name not present in
# this array is appended like an ordinary word. "html" and "body" are set to
# 1. The remaining entries are created by a bare reference, so they exist
# for the `in` test but compare equal to 0, which is what the
# `tags[tagName] == 0` check on closing tags distinguishes.
45 # Only tags that come with closing tags are allowed!
46 tags["html"]=1; tags["body"]=1; tags["head"]; tags["title"];
47 tags["div"]; tags["h1"]; tags["h2"]; tags["h3"]; tags["h4"]; tags["h5"];
48 tags["h6"]; tags["p"]; tags["dl"]; tags["dt"]; tags["dd"]; tags["table"];
49 tags["tr"]; tags["td"];
# Read the file named by `get` into `rest`, re-adding "\n" after each line.
# The loop ends when getline returns anything other than 1 (end of file or a
# read error). NOTE(review): `get` is assigned outside this listing.
56 while ((getline line < get) == 1)
57 rest = rest line "\n";
# Remove a class="..." or class='...' attribute (name spelled "class" or
# "CLASS" only) together with the whitespace in front of it, but only when
# it is followed directly by optional whitespace and the closing ">".
# NOTE(review): presumably guarded by killClass on a line not shown here.
60 gsub(/[ \t\n]+(class|CLASS)=("[^"]*"|'[^']*')[ \t\n]*>/, ">", rest); #"
62 # Split lines at whitespace and some tags
# State for the tokenizer loop. `ind` is the number of indentStr characters
# put in front of the line being built; `nextInd` is saved and restored
# around preserved sections below. NOTE(review): the statements that change
# nextInd per tag and copy it into ind are not visible in this listing.
63 nextInd = ind = 0; # Nr of characters of indentation
64 doc = ""; # Output document
65 docLine = ""; # Current line to append words to
66 doPreserve = 0; # Nesting level of <pre>
# Each iteration locates the next delimiter in `rest`: either a run of
# whitespace, or a tag opener ("<", optional spaces, optional "/"). The text
# in front of the delimiter is handed to appendWord() and the consumed
# prefix is cut off `rest`.
67 while (match(rest, /([ \n\t]+|< *(\/ *)?)/)) {
68 #print "MATCH \"" substr(rest, RSTART, RLENGTH) "\"";
69 #print "xxx "nextInd" " gensub(/\n/, "\\\\n", "g", substr(rest, 1, 90));
# --- Delimiter is a tag opener ---
# tagName: up to maxTagLength characters after the match, lower-cased and
# truncated at the first character that is not a-z or 0-9.
# closing: position of "/" in the matched text after "<" (0 = opening tag).
# NOTE(review): maxTagLength is defined outside this listing.
71 if (substr(rest, RSTART, 1) == "<") {
73 tagName = tolower(substr(rest, RSTART + RLENGTH, maxTagLength));
74 gsub(/[^a-z0-9].*$/, "", tagName);
75 closing = index(substr(rest, RSTART + 1, RLENGTH - 1), "/");
# Tags listed in `preserve` (array defined outside this listing; presumably
# <pre>-like tags). For a closing tag while inside a preserved section, the
# text up to the tag name is appended; once doPreserve is back at 0 the
# normal line limit (maxLen, defined elsewhere) and the indentation saved in
# nonpreserveInd are restored.
78 if (tagName in preserve) {
79 if (closing && doPreserve > 0) {
80 appendWord(substr(rest, 1, RSTART + RLENGTH + length(tagName) \
84 if (doPreserve == 0) {
85 curMaxLen = maxLen; nextInd = nonpreserveInd;
87 rest = substr(rest, RSTART + RLENGTH + length(tagName));
# Entering a preserved section: on the outermost level remember the current
# nextInd and switch indentation off; the huge curMaxLen effectively
# disables wrapping in appendWord().
91 # Disable indentation while inside <pre>
92 if (doPreserve == 0) { nonpreserveInd = nextInd; nextInd = 0; }
93 ++doPreserve; curMaxLen = 9999999;
# Tag name not in `tags`: treat everything up to and including the tag name
# as part of an ordinary word and consume it.
97 if (!(tagName in tags)) {
99 appendWord(substr(rest, 1, RSTART + RLENGTH + length(tagName) - 1),
101 rest = substr(rest, RSTART + RLENGTH + length(tagName));
# Closing tag of a known element. Entries of `tags` that compare equal to 0
# keep the closing tag attached to the preceding text; the other visible
# sequence appends the preceding text, sets curMaxLen to 0 so that the next
# appendWord() cannot fit the tag on the current line, then appends the tag
# start. NOTE(review): the lines separating these two cases are not shown.
103 } else if (closing) {
104 #print "---/" tagName;
106 if (tags[tagName] == 0) {
107 appendWord(substr(rest, 1, RSTART + RLENGTH + length(tagName) \
112 appendWord(substr(rest, 1, RSTART - 1), "");
113 curMaxLen = 0; # Force new line with next appendWord()
114 appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
# Presumably the opening-tag case of a known element: append the text in
# front of the tag, force a line break, then append "<" plus the tag name.
118 #print "--- " tagName;
120 appendWord(substr(rest, 1, RSTART - 1), "");
121 curMaxLen = 0; # Force new line with next appendWord()
122 appendWord(substr(rest, RSTART, RLENGTH + length(tagName)), "");
# Drop everything up to and including the tag name; the remainder of the tag
# (attributes, ">") stays in `rest` and is handled as ordinary words.
126 rest = substr(rest, RSTART + RLENGTH + length(tagName));
# --- Delimiter is whitespace ---
131 #print "dop " doPreserve ", RSTART=" RSTART ", RLENGTH=" RLENGTH;
# Consume exactly one whitespace character and pass it on unchanged as the
# separator, so runs of spaces and newlines survive one character at a time.
133 # Preserve spaces and newlines in output
134 appendWord(substr(rest, 1, RSTART - 1), substr(rest, RSTART, 1));
135 rest = substr(rest, RSTART + 1);
# Otherwise collapse the whole whitespace run: no separator when the next
# character is ">" (avoids a space before the end of a tag), else one space.
138 if (substr(rest, RSTART + RLENGTH, 1) == ">")
139 appendWord(substr(rest, 1, RSTART - 1), "");
141 appendWord(substr(rest, 1, RSTART - 1), " ");
142 rest = substr(rest, RSTART + RLENGTH);
# No delimiter left: emit the unfinished line followed by whatever remains
# in `rest`; no newline is added by this statement.
146 doc = doc substr(indentStr, 1, ind) docLine rest;