#! /bin/sh
#
# swan -- sub-web analyzer
#
# Copyright (c) 1995, Tom Verhoeff
#
VERSION='1.3b'
#
# New in version 1.3b (16 February 1998):
# * HTML files named *.htm are included as well.
# * Tags <BODY BACKGROUND=...> and <FRAME SRC=...> are recognized.
# * Hyperlinks to external resources are grouped by scheme,netloc.
# * Minor cosmetic changes in the HTML output (the cross references).
#
# New in version 1.2b (16 July 1996):
# * SWANURL set to my regular location at TUE (instead of U of Waterloo)
# * backpath computed correctly when analyzing all resources (incl. invisible)
# * backpath computed correctly for root directory of subweb (no longer hangs)
# * N.B. hyperlink to root dir (w/o mentioning index.html) still not flagged
# * escaped ` within "..." to make error messages come out OK (bash now works)
#
# New in version 1.1b (18 January 1996):
# * hyperlinks from individual cross reference documents go via root of subweb
# * numeric fragment names treated as strings, e.g. 1.1 differs from 1.10
# * referenced fragment names in unavailable documents are reported as undefined
#
PROG=`basename $0`
DATE=`date`
INVENTORY='.SWAN'
SWANHTML='..SWAN.html'
SWANURL='http://www.win.tue.nl/~wstomv/swan/swan.html'
TMP='/tmp'
TMPINV="$TMP/SWAN$$.inv"
TMPHTML="$TMP/SWAN$$.html"
TMPSH="$TMP/SWAN$$.sh"
#
help()
{
    echo "SWAN: SubWeb Analyzer, Version $VERSION"
    echo "Copyright: (C) 1995, Tom Verhoeff"
    echo "Usage: swan [ options ] [ directory ]"
    echo "Default: Write report for subweb of current directory to stdout"
    echo "  -a    analyze all resources and hyperlinks, including invisible ones"
    echo "  -i    read inventory from $INVENTORY"
    echo "  -h    show this help; no further processing is done"
    echo "  -o    write inventory to $INVENTORY"
    echo "  -q    do not write anything to standard output (quiet mode)"
    echo "  +/-r  force world-read permission on/off for cross references"
    echo "  -s    report only statistics on stdout"
    echo "  -x    create individual cross references"
    echo "  -X    create overall cross reference"
    exit 0
}

error()
{
    echo "$PROG: $*"
    exit 1
}

main()
{
    #
    # The main function, invoked at very end
    #
    # set default options
    NOTALL='true'
    REPORT='true'
    STATISTICS='true'
    POSTPROC=':'
    DIR='.'
    # parse command line
    while test $# -ne 0
    do
        case "$1" in
        -a)    NOTALL="" ;;
        -h)    help ;;
        -i)    INPUT="$INVENTORY" ;;
        -o)    OUTPUT="$INVENTORY" ;;
        -q)    QUIET="$1"
               REPORT=""
               STATISTICS="" ;;
        +r|-r) POSTPROC="chmod o$1" ;;
        -s)    REPORT="" ;;
        -x)    INDIVXREF="$TMPSH" ;;
        -X)    TOTALXREF="$TMPHTML" ;;
        +*|-*) error "invalid option \`$1'; \`swan -h' provides help" ;;
        *)     if test -d $1
               then DIR="$1"
               else error "\`$1' does not exist or is not a directory"
               fi ;;
        esac
        shift
    done
    #
    cd $DIR
    ROOT=`pwd`
    if test "$INPUT"
    then
        if test ! -f "$INPUT"
        then error "inventory $INPUT does not exist"
        else
            if test "`check_inv $INPUT`"
            then error "inventory $INPUT has bad format"
            fi
        fi
    fi
    if test "$INPUT" -a "$OUTPUT"
    then
        echo "$PROG: ignoring option -o because of -i"
        OUTPUT=""
    fi
    trap 'rm -f $TMPINV $TMPHTML $TMPSH; exit 1' 1 2 15
    if test "$INPUT"
    then cat $INPUT
    else frontend
    fi |
    if test "$OUTPUT"
    then tee $TMPINV | awk_them
    else awk_them
    fi
    trap '' 1 2 15
    if test "$OUTPUT"
    then
        cp $TMPINV $OUTPUT
        rm -f $TMPINV
    fi
    if test "$TOTALXREF"
    then
        cp $TMPHTML $SWANHTML
        $POSTPROC $SWANHTML
        rm -f $TMPHTML
    fi
    if test "$INDIVXREF"
    then
        sh <$TMPSH
        rm -f $TMPSH
    fi
}

frontend()
{
    grep_them |
#   tee grep.out |
    sed_them |
#   tee sed.out |
    sort_them
}

check_inv()
{
    #
    # check format of inventory
    #
    awk -F# 'NF<10 { printf "bad format" } { exit }' $1
}

grep_them()
{
    #
    # Supply (implicit) definitions for all (visible) resources in sub-web.
    # Select lines with (explicit) definitions and hyperlinks (`<A ', `<IMG ',
    # `<BODY ', `<FRAME ') in HTML files
    #
    if test "$NOTALL"
    then
        find . \
            -type d \( -name '.*' ! -name '.' -o ! -perm -5 \) -prune -o \
            -type f -perm -4 ! -name '.*' \
            -exec echo -n {} \; -exec echo '::' \; \
            \( -name '*.html' -o -name '*.htm' \) \
            -exec egrep -n -i '<(A|IMG|BODY|FRAME) ' {} /dev/null \;
    else
        find . \
            -type d ! -name '.' ! -perm -5 -prune -o \
            -type f -perm -4 \
            -exec echo -n {} \; -exec echo '::' \; \
            \( -name '*.html' -o -name '*.htm' -o -name '.*.html' -o -name '.*.htm' \) \
            -exec egrep -n -i '<(A|IMG|BODY|FRAME) ' {} /dev/null \;
    fi
}
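# Illustration (hypothetical file names): grep_them emits one "file::" line
# per visible resource (from the two echo's above) plus one
# "file:linenumber:matched line" line per tag found by `egrep -n', e.g.
#
#   ./docs/index.html::
#   ./docs/index.html:12:<A HREF="guide.html#intro">Guide</A>
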
sed_them()
{
    #
    # Prepare lines for sorting. Output has lines of the form
    #
    #   kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag
    #
    # where
    #
    #   kind         = "d" (definition) or "r" (reference/hyperlink)
    #   source, path = filepath (if relative then w.r.t. current dir)
    #   scheme       = protocol for URL (internal file if empty)
    #
    # Note: Space characters cannot be used to delimit fields, because
    # fields may be empty.  We have chosen colons (:) in first phase
    # because that is what grep produces (source:linenumber:line).
    # In the second phase we switch to `#' because it is safer.
    #
    # The first round of substitutions yields a line of the form
    #
    #   kind#sourcepath#linenumber#url
    #
    # where url has an explicit (possibly empty) fragment part (trailing #frag)
    #
    sed '
# First, isolate definitions and hyperlinks (possibly multiple per line):
# Prefix each line with a newline;
# substitute an implicit definition $1:: by \n d#$1###
s/^/\
/
s|^\n\(.*\)::$|\
d#\1###|
:repeat1
# Substitute $1:$2:$3<A $4 NAME="$5" by \n d#$1#$2##$5 \n $1:$2:$3$4
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Aa] \)\([^><]*\)[Nn][Aa][Mm][Ee][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
d#\1#\2##\5\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Aa] \)\([^><]*\)[Hh][Rr][Ee][Ff][ ]*=[ ]*"\{0,1\}\([^#"><]*\)#\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\6\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Ii][Mm][Gg] \)\([^><]*\)[Ss][Rr][Cc][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Bb][Oo][Dd][Yy] \)\([^><]*\)[Bb][Aa][Cc][Kk][Gg][Rr][Oo][Uu][Nn][Dd][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Ff][Rr][Aa][Mm][Ee] \)\([^><]*\)[Ss][Rr][Cc][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
t repeat1
s/\n[.].*$//
# N.B. The pattern space now may contain embedded newlines! Using s/.../.../g
# Second, split the destination RELURL (see RFC 1808), frag is already off:
# Prepare by splitting the source path and introducing empty scheme and netloc
# Substitute $1#$2/$3#$4#$5 by $1#$2/#$3#$4###$5
# The following only operates from fourth # on to the right.
# Substitute ##$2:$3#$4 (where $2 proper) by $2##$3#$4 (split off scheme)
# The following only operates from fifth # on to the right.
# Substitute #//$2/$3#$4 by $2#/$3#$4 (split off netloc)
# The following three only operate from sixth # on to the right.
# Substitute $2[?$3]#$4 by $2#$3#$4 (split off query)
# Substitute $2[;$3]#$4#$5 by $2#$3#$4#$5 (split off params)
s|\(\n.#[^#]*/\)\([^/#]*#[^#]*#\)|\1#\2##|g
s|\(\n.#[^#]*#[^#]*#[^#]*#\)##\([A-Za-z0-9+.-]*\):|\1\2##|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#\)#//\([^/#]*\)|\1\2#|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#[^#]*#[^?#]*\)?\{0,1\}|\1#|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#[^#]*#[^;#]*\);\{0,1\}|\1#|g
# Third, resolve RELURLs
# Substitute $1#$2#$3#$4#### by $1#$2#$3#$4###$3# (no scheme, netloc, path)
# Substitute $1#$2#$3#$4###$5 where $5 does not start with /
#   by $1#$2#$3#$4###$2$5 (no scheme and netloc, but with rel. path)
s|\(\n.#[^#]*#\([^#]*\)#[^#]*###\)#|\1\2#|g
s|\(\n.#\([^#]*\)#[^#]*#[^#]*###\)\([^/]\)|\1\2\3|g
# Fourth, remove ./ and trailing . and xxx/../ and trailing xxx/.. where xxx!=..
t rep4a
:rep4a
s|\([#/]\)[.]/|\1|g
t rep4a
s|\([#/]\)[.]#|\1#|g
t rep4b
:rep4b
s|\([#/]\)[^.#][^/#]*/[.][.]/|\1|g
s|\([#/]\)[.][^./#][^/#]*/[.][.]/|\1|g
s|\([#/]\)[.][.][^/#]\{1,\}[^/#]*/[.][.]/|\1|g
t rep4b
s|\([/#]\)[^.#][^/#]*/[.][.]#|\1#|g
s|\([/#]\)[.][^./#][^/#]*/[.][.]#|\1#|g
s|\([/#]\)[.][.][^/#]\{1,\}[^/#]*/[.][.]#|\1#|g
# Cleanup by removing initial \n, or whole line if only one left
t nop
:nop
s/^\n//
t
d
'
}
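# Worked examples (illustrative, hypothetical file names), in the field layout
# kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag documented above:
#
#   ./docs/index.html:12:<A HREF="guide.html#intro">Guide</A>
# should come out roughly as
#   r#docs/#index.html#12###docs/guide.html###intro
#
#   ./index.html:5:<IMG SRC="http://www.win.tue.nl/img/logo.gif">
# should come out roughly as
#   r##index.html#5#http#www.win.tue.nl#/img/logo.gif###
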
sort_them()
{
    #
    # Sort definitions and hyperlinks on keys
    #   scheme#netloc#path#params#query#frag, kind#dirpath#file, line(numeric)
    #
    #   kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag
    #    0     1      2    3    4      5     6     7      8     9
    #
    sort -t# +4 +0 -3 +3 -4n
}
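# The obsolescent +pos/-pos keys above correspond, in POSIX -k notation, to
#   sort -t'#' -k5 -k1,3 -k4,4n
# i.e. primary key scheme#netloc#path#params#query#frag (fields 5-10),
# then kind#dirpath#file (fields 1-3), then the line number (field 4, numeric).
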
\n" output "\n" >totalxref; printf "