#! /bin/sh
#
# swan -- sub-web analyzer
#
# Copyright (c) 1995, Tom Verhoeff
#
VERSION='1.3b'
#
# New in version 1.3b (16 February 1998):
#  * HTML files named *.htm are included as well.
#  * Tags <BODY BACKGROUND=...> and <FRAME SRC=...> are recognized.
#  * Hyperlinks to external resources are grouped by scheme,netloc.
#  * Minor cosmetic changes in the HTML output (the cross references).
#
# New in version 1.2b (16 July 1996):
#  * SWANURL set to my regular location at TUE (instead of U of Waterloo)
#  * backpath computed correctly when analyzing all resources (incl. invisible)
#  * backpath computed correctly for root directory of subweb (no longer hangs)
#  * N.B. hyperlink to root dir (w/o mentioning index.html) still not flagged
#  * escaped ` within "..." to make error messages come out OK (bash now works)
#
# New in version 1.1b (18 January 1996):
#  * hyperlinks from individual cross reference documents go via root of subweb
#  * numeric fragment names treated as strings, e.g. 1.1 differs from 1.10
#  * referenced fragment names in unavailable documents are reported as undefined
#
# Maintenance note: this copy of the script was recovered from a damaged
# extraction in which line breaks were lost and everything that looked like
# an HTML tag had been stripped.  Passages marked NOTE(review) were
# reconstructed from the surrounding code and should be compared with a
# pristine copy of swan 1.3b when one is available.
#
PROG=${0##*/}
DATE=$(date)
INVENTORY='.SWAN'
SWANHTML='..SWAN.html'
SWANURL='http://www.win.tue.nl/~wstomv/swan/swan.html'
TMP='/tmp'
# Scratch files live in a private directory that is created on demand by
# make_workdir (no predictable names directly under $TMP).
WORKDIR=''
TMPINV=''
TMPHTML=''
TMPSH=''
#

help() {
  #
  # Print the usage summary on stdout and terminate successfully.
  #
  cat <<EOF
SWAN: SubWeb Analyzer, Version $VERSION
Copyright: (C) 1995, Tom Verhoeff
Usage: swan [ options ] [ directory ]
Default: Write report for subweb of current directory to stdout
  -a    analyze all resources and hyperlinks, including invisible ones
  -i    read inventory from $INVENTORY
  -h    show this help; no further processing is done
  -o    write inventory to $INVENTORY
  -q    do not write anything to standard output (quiet mode)
  +/-r  force world-read permission on/off for cross references
  -s    report only statistics on stdout
  -x    create individual cross references
  -X    create overall cross reference
EOF
  exit 0
}

error() {
  #
  # Print "PROG: message" on stderr and terminate with status 1.
  #
  printf '%s: %s\n' "$PROG" "$*" >&2
  exit 1
}

cleanup() {
  #
  # Remove the private scratch directory, if one was created.
  #
  if test -n "$WORKDIR"
  then
    rm -rf -- "$WORKDIR"
    WORKDIR=''
  fi
}

make_workdir() {
  #
  # Create the private scratch directory and derive the scratch file names.
  #
  WORKDIR=$(mktemp -d "$TMP/SWAN.XXXXXX") ||
    error "cannot create temporary directory in $TMP"
  TMPINV="$WORKDIR/inventory"
  TMPHTML="$WORKDIR/total.html"
  TMPSH="$WORKDIR/individual.sh"
}

main() {
  #
  # The main function, invoked at very end
  #
  # set default options (explicitly, so nothing leaks in from the environment)
  NOTALL='true'
  REPORT='true'
  STATISTICS='true'
  POSTPROC=':'
  DIR='.'
  INPUT=''
  OUTPUT=''
  QUIET=''
  INDIVXREF=''
  TOTALXREF=''
  # parse command line
  while test $# -ne 0
  do
    case "$1" in
      -a) NOTALL='' ;;
      -h) help ;;
      -i) INPUT="$INVENTORY" ;;
      -o) OUTPUT="$INVENTORY" ;;
      -q) QUIET="$1"
          REPORT=''
          STATISTICS='' ;;
      +r|-r) POSTPROC="chmod o$1" ;;
      -s) REPORT='' ;;
      -x) INDIVXREF='true' ;;
      -X) TOTALXREF='true' ;;
      +*|-*) error "invalid option \`$1'; \`swan -h' provides help" ;;
      *)  if test -d "$1"
          then
            DIR="$1"
          else
            error "\`$1' does not exist or is not a directory"
          fi ;;
    esac
    shift
  done
  #
  cd "$DIR" || error "cannot change directory to \`$DIR'"
  ROOT=$(pwd)
  if test "$INPUT"
  then
    if test ! -f "$INPUT"
    then
      error "inventory $INPUT does not exist"
    elif test "$(check_inv "$INPUT")"
    then
      error "inventory $INPUT has bad format"
    fi
  fi
  if test "$INPUT" && test "$OUTPUT"
  then
    printf '%s: ignoring option -o because of -i\n' "$PROG" >&2
    OUTPUT=''
  fi
  trap cleanup EXIT
  trap 'exit 1' HUP INT TERM
  if test "$OUTPUT" || test "$TOTALXREF" || test "$INDIVXREF"
  then
    make_workdir
    # awk_them receives the scratch file names through these two variables
    if test "$TOTALXREF"; then TOTALXREF="$TMPHTML"; fi
    if test "$INDIVXREF"; then INDIVXREF="$TMPSH"; fi
  fi
  if test "$INPUT"
  then
    cat "$INPUT"
  else
    frontend
  fi |
  if test "$OUTPUT"
  then
    tee "$TMPINV" | awk_them
  else
    awk_them
  fi
  if test "$OUTPUT"
  then
    cp "$TMPINV" "$OUTPUT"
  fi
  if test "$TOTALXREF"
  then
    # $POSTPROC is deliberately unquoted: it holds a command plus its option
    cp "$TMPHTML" "$SWANHTML" && $POSTPROC "$SWANHTML"
  fi
  # the generated script only exists if at least one HTML document was found
  if test "$INDIVXREF" && test -f "$TMPSH"
  then
    sh <"$TMPSH"
  fi
  cleanup
}

frontend() {
  #
  # Build the sorted inventory on stdout: collect, normalize, sort.
  #
  grep_them |	# tee grep.out |
  sed_them |	# tee sed.out |
  sort_them
}

check_inv() {
  #
  # check format of inventory: prints "bad format" if the first line of
  # file $1 has fewer than 10 #-separated fields, prints nothing otherwise
  #
  awk -F'#' 'NF<10 { printf "bad format" } { exit }' "$1"
}

grep_them() {
  #
  # Supply (implicit) definitions for all (visible) resources in sub-web.
  # Select lines with (explicit) definitions and hyperlinks.
  #
  # Every world-readable file yields a line  ./path::<A NAME="">  (an
  # implicit definition with empty line number and empty fragment); every
  # line of an HTML file with an A, IMG, BODY or FRAME tag yields
  # ./path:linenumber:line.
  #
  # NOTE(review): the first find expression (default mode, invisible
  # resources skipped) was lost in the damaged source.  It was rebuilt from
  # the surviving -a branch by additionally excluding names that start with
  # a dot; confirm against a pristine copy.
  #
  if test "$NOTALL"
  then
    find . \
      -type d ! -name '.' \( -name '.*' -o ! -perm -5 \) -prune -o \
      -type f ! -name '.*' -perm -4 \
      -exec printf '%s::<A NAME="">\n' {} \; \
      \( -name '*.html' -o -name '*.htm' \) \
      -exec grep -E -n -i '<(A|IMG|BODY|FRAME) ' {} /dev/null \;
  else
    find . \
      -type d ! -name '.' ! -perm -5 -prune -o \
      -type f -perm -4 \
      -exec printf '%s::<A NAME="">\n' {} \; \
      \( -name '*.html' -o -name '*.htm' -o -name '.*.html' -o -name '.*.htm' \) \
      -exec grep -E -n -i '<(A|IMG|BODY|FRAME) ' {} /dev/null \;
  fi
}

sed_them() {
  #
  # Prepare lines for sorting.  Output has lines of the form
  #
  #   kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag
  #
  # where
  #
  #   kind = "d" (definition) or "r" (reference/hyperlink)
  #   source, path = filepath (if relative then w.r.t. current dir)
  #   scheme = protocol for URL (internal file if empty)
  #
  # Note: Space characters cannot be used to delimit fields, because
  #   fields may be empty.  We have chosen colons (:) in first phase
  #   because that is what grep produces (source:linenumber:line).
  #   In the second phase we switch to `#' because it is safer.
  #
  # The first round of substitutions yields a line of the form
  #
  #   kind#sourcepath#linenumber#url
  #
  # where url has an explicit (possibly empty) fragment part (trailing#frag)
  #
  sed '
# First, isolate definitions and hyperlinks (possibly multiple per line):
# each round moves one NAME/HREF/SRC/BACKGROUND attribute out of the input
# line into a record of its own, placed in front of the remaining input.
# NOTE(review): the leading newline insertion, the repeat1 label and the
# head of the NAME pattern were lost in the damaged source and were rebuilt
# from the four surviving substitutions that follow.
s/^/\
/
:repeat1
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Aa] \)\([^><]*\)[Nn][Aa][Mm][Ee][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
d#\1#\2##\5\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Aa] \)\([^><]*\)[Hh][Rr][Ee][Ff][ ]*=[ ]*"\{0,1\}\([^#"><]*\)#\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\6\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Ii][Mm][Gg] \)\([^><]*\)[Ss][Rr][Cc][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Bb][Oo][Dd][Yy] \)\([^><]*\)[Bb][Aa][Cc][Kk][Gg][Rr][Oo][Uu][Nn][Dd][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
s/\n\([.][^:]*\):\([^:]*\):\(.*<[Ff][Rr][Aa][Mm][Ee] \)\([^><]*\)[Ss][Rr][Cc][ ]*=[ ]*"\{0,1\}\([^" ><]*\)"\{0,1\}/\
r#\1#\2#\5#\
\1:\2:\3\4 /
t repeat1
# drop what is left of the input line; only the records remain
s/\n[.].*$//
# N.B. The pattern space now may contain embedded newlines!  Using s/.../.../g
# Second, split the destination RELURL (see RFC 1808), frag is already off:
# Prepare by splitting the source path and introducing empty scheme and netloc
#   Substitute $1#$2/$3#$4#$5 by $1#$2/#$3#$4###$5
# The following only operates from fourth # on to the right.
#   Substitute ##$2:$3#$4 (where $2 proper) by $2##$3#$4   (split off scheme)
# The following only operates from fifth # on to the right.
#   Substitute #//$2/$3#$4 by $2#/$3#$4                    (split off netloc)
# The following three only operate from sixth # on to the right.
#   Substitute $2[?$3]#$4 by $2#$3#$4                      (split off query)
#   Substitute $2[;$3]#$4#$5 by $2#$3#$4#$5                (split off params)
s|\(\n.#[^#]*/\)\([^/#]*#[^#]*#\)|\1#\2##|g
s|\(\n.#[^#]*#[^#]*#[^#]*#\)##\([A-Za-z0-9+.-]*\):|\1\2##|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#\)#//\([^/#]*\)|\1\2#|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#[^#]*#[^?#]*\)?\{0,1\}|\1#|g
s|\(\n.#[^#]*#[^#]*#[^#]*#[^#]*#[^#]*#[^;#]*\);\{0,1\}|\1#|g
# Third, resolve RELURLs
#   Substitute $1#$2#$3#$4#### by $1#$2#$3#$4###$3#  (no scheme, netloc, path)
#   Substitute $1#$2#$3#$4###$5 where $5 does not start with /
#     by $1#$2#$3#$4###$2$5          (no scheme and netloc, but with rel. path)
s|\(\n.#[^#]*#\([^#]*\)#[^#]*###\)#|\1\2#|g
s|\(\n.#\([^#]*\)#[^#]*#[^#]*###\)\([^/]\)|\1\2\3|g
# Fourth, remove ./ and trailing . and xxx/../ and trailing xxx/.. where xxx!=..
# (each bare "t label" resets the substitution flag before the next loop)
t rep4a
:rep4a
s|\([#/]\)[.]/|\1|g
t rep4a
s|\([#/]\)[.]#|\1#|g
t rep4b
:rep4b
s|\([#/]\)[^.#][^/#]*/[.][.]/|\1|g
s|\([#/]\)[.][^./#][^/#]*/[.][.]/|\1|g
s|\([#/]\)[.][.][^/#]\{1,\}[^/#]*/[.][.]/|\1|g
t rep4b
s|\([/#]\)[^.#][^/#]*/[.][.]#|\1#|g
s|\([/#]\)[.][^./#][^/#]*/[.][.]#|\1#|g
s|\([/#]\)[.][.][^/#]\{1,\}[^/#]*/[.][.]#|\1#|g
# Cleanup by removing initial \n, or whole line if only one left
t nop
:nop
s/^\n//
t
d
'
}

sort_them() {
  #
  # Sort definitions and hyperlinks on keys
  #   scheme#netloc#path#params#query#frag, kind#dirpath#file, line(numeric)
  #
  # kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag
  #   1     2      3   4    5      6     7     8      9    10
  #
  # Uses -k key definitions (the historic "+4 +0 -3 +3 -4n" form is no longer
  # accepted by current sort implementations) and the C locale, so that
  # records with equal keys are adjacent whatever the user's collation is.
  #
  LC_ALL=C sort -t'#' -k5 -k1,3 -k4,4n
}

awk_them() { # backend
  #
  # Generate report and create cross references, both overall and individual
  #
  # kind#dirpath#file#lnr#scheme#netloc#path#params#query#frag
  #  $1    $2     $3   $4   $5     $6    $7    $8     $9   $10
  #
  # Shell settings are handed over with -v (awk applies backslash-escape
  # processing to the values) rather than being pasted into the program text.
  #
  awk -F'#' \
    -v ver="$VERSION" -v date="$DATE" -v dir="$ROOT/" \
    -v notall="$NOTALL" -v quiet="$QUIET" -v report="$REPORT" \
    -v statistics="$STATISTICS" -v totalxref="$TOTALXREF" \
    -v indivxref="$INDIVXREF" -v swanurl="$SWANURL" \
    -v postproc="$POSTPROC" '
function esc(s) {
  # double every % so that data survives being part of a printf format
  gsub(/%/, "%%", s);
  return s;
}
function shq(s,    n, part, i, out) {
  # quote s as one word for sh (\047 is the single quote character)
  n = split(s, part, "\047");
  out = part[1];
  for (i = 2; i <= n; ++i) out = out "\047\\\047\047" part[i];
  return "\047" out "\047";
}
BEGIN { # define some global constants
  # ver, date, dir, notall, quiet, report, statistics, totalxref, indivxref,
  # swanurl and postproc are set on the command line; notall is not used by
  # the recovered program text.
  #
  # some format strings
  # NOTE(review): all HTML markup in this program had been stripped from the
  # damaged source.  Tags were restored from the positions of the surviving
  # text; the definitions from shclose up to inthdr (including the wording
  # of title and explain) are reconstructions.
  htmlstart="<HTML>\n<HEAD>\n<TITLE>%s</TITLE>\n</HEAD>\n<BODY>\n<H1>%s</H1>\n";
  link="<A HREF=\"%s\">%s</A>";
  fragdef="<A NAME=\"%s\">%s</A>";
  strong="<STRONG>%s</STRONG>";
  htmlend="</BODY>\n</HTML>\n";
  # quoted here-document delimiter: sh must not expand anything in the HTML
  shopen="cat - >%s <<\047__SWAN_EOF__\047\n";
  shclose="__SWAN_EOF__\n" esc(postproc) " %s\n";
  swanlink=sprintf(link, swanurl, "SWAN");
  title="Cross References for ";
  explain="<P>\nEach resource is followed by the documents and line numbers\n" \
    "of the hyperlinks that refer to it; an asterisk (*) marks a repeated\n" \
    "hyperlink on the same line.\n<P>\n";
  intdef=sprintf(fragdef, "internal", "Internal Resources");
  intlink=sprintf(link, "#internal", "Internal Resources");
  extdef=sprintf(fragdef, "external", "External Resources");
  extlink=sprintf(link, "#external", "External Resources");
  statdef=sprintf(fragdef, "statistics", "Statistics");
  statlink=sprintf(link, "#statistics", "Statistics");
  inthdr="<H2>" intdef "</H2>\n[ " extlink " |\n" statlink " ]<P>\n";
  exthdr="<H2>" extdef "</H2>\n[ " intlink " |\n" statlink " ]<P>\n";
  stathdr="<H2>" statdef "</H2>\n[ " intlink " |\n" extlink " ]<P>\n";
  # blurb is only ever used inside printf formats, hence esc()
  blurb=esc("<EM>\nGenerated by\n" swanlink "\nversion " ver "\non " date ".</EM>\n");
  # initial output
  if (!quiet) printf "SubWeb Analysis of %s\n-----\n", dir;
  if (totalxref)
    printf(htmlstart blurb explain inthdr, title dir, title dir) >totalxref;
}
{ # always
  if (NF<10) { # inventory has wrong format
    printf "Inventory has wrong format on line %d (skipped).\n", NR;
    next;
  }
  src=$2 $3;
  linenr=$4;
  if ($5!="" || $6!="" || substr($7, 1, 1)=="/") internal=0;
  else {
    internal=1;
    nseg=split($7, seg, "/"); # split path into segments
    backpath=""; # construct path from resource back to root of subweb
    # NOTE(review): loop bound and the url computation below were lost in
    # the damaged source; sed_them strips the leading ./ from paths, so a
    # path with nseg segments lies nseg-1 directories below the root.
    for (i=1;i<nseg;++i) backpath=backpath "../";
  }
  # printable form of the target, without fragment
  url=$7;
  if ($8!="") url=url ";" $8;
  if ($9!="") url=url "?" $9;
  if ($6!="") url="//" $6 url;
  if ($5!="") url=$5 ":" url;
}
# NOTE(review): the pattern of the next rule was lost in the damaged source
# (its body survived); it was chosen so that every list of line numbers is
# terminated before a new source, fragment, url or record kind starts.
Psrc!="" && (src!=Psrc || url!=Purl || (""$10)!=P10 || $1!=P1) {
  # finish the line-number list of the preceding source
  output="\n";
  if (report && error) printf output;
  if (totalxref) printf output >totalxref;
  if (indivfile) printf output >indivxref;
  if ($1!=P1) error=0;
  Psrc="";
}
url!=Purl { # new target URL
  # A new <DL> group starts with every internal resource and with every new
  # scheme,netloc of external ones.  Pinternal is part of the test so that a
  # scheme-less absolute path directly after an internal document still
  # closes that document (otherwise its here-document stayed open).
  newgroup=(Purl=="" || internal || Pinternal || $5!=P5 || $6!=P6);
  if (Purl!="") { # finish preceding url
    if (refs) { # count number of referenced resources
      if (Pinternal) ++intresources; else ++extresources;
    }
    else { # not referenced
      ++unreferenced;
      if (report) printf "%s NOT REFERENCED\n", Purl;
      output=" " sprintf(strong, "NOT REFERENCED") "\n";
      if (totalxref) printf output >totalxref;
      if (indivfile) printf output >indivxref;
    }
    if (newgroup) {
      output="</DL>\n";
      if (totalxref) printf output >totalxref;
      output=output "<HR>" blurb htmlend esc(sprintf(shclose, shq(indivfile)));
      if (indivfile) printf output >indivxref;
      indivfile="";
    }
  }
  # --------------------------------------
  refs=0; # number of hyperlinks to this url
  error=0; # no errors (yet) with this url
  if (Pinternal && !internal) { # first external resource
    if (totalxref) printf "<HR>\n" exthdr >totalxref;
  }
  if (!newgroup) output="";
  else output="<P>\n<DL>\n";
  eurl=esc(url);
  # the %s kept in front of the url is filled in with backpath when printing
  output=output "<DT>" sprintf(strong, sprintf(link, "%s" eurl, eurl)) "\n";
  if ($1=="d") { # defined resource, hence internal
    ++resources; # count number of internal resources
    if ($7 ~ /[.]html?$/) { # html document
      ++htmldocuments;
      if (indivxref) { # create individual cross reference
        indivfile=$2 "." $3; # assert $7==src
        printf shopen, shq(indivfile) >indivxref;
        printf htmlstart, title $7, title $7 >indivxref;
        printf explain >indivxref;
      }
    }
  }
  else if (internal) { # internal hyperlink to unavailable resource
    ++unavailables; ++error;
    if (report) printf "%s NOT AVAILABLE\n", url;
    output=output " " sprintf(strong, "NOT AVAILABLE") "\n";
  }
  else { # external hyperlink, nothing special to do
  }
  if (totalxref) printf output, "" >totalxref;
  if (indivfile) printf output, backpath >indivxref;
  P4=""; P10=""; Psrc="";
}
(""$10)!=P10 { # new fragment (prepending "" forces string compare)
  error=0; # no errors (yet) with this url#frag
  efrag=esc($10);
  output="<DT>" sprintf(strong, sprintf(link, "%s" esc(url) "#" efrag, efrag)) "\n";
  if ($1=="d") { # defined fragment
    ++fragments;
    output=output " defined on line " esc(linenr) "\n";
    linenr=""; # line number printed
  }
  else if (internal) { # internal hyperlink to undefined fragment
    ++undefined; ++error;
    if (report) printf "%s#%s NOT DEFINED\n", url, $10;
    output=output " " sprintf(strong, "NOT DEFINED") "\n";
  }
  if (totalxref) printf output, "" >totalxref;
  if (indivfile) printf "<P>\n" output, backpath >indivxref;
  P4=""; Psrc="";
}
$1=="d" && url==Purl && (""$10)==P10 { # duplicate definition, assert P1=="d"
  # (prepending "" in pattern above forces string compare)
  ++duplicates; ++error;
  if (Psrc=="") {
    if (report) printf "%s#%s DUPLICATE DEFINITION", url, $10;
    output="<DD>" sprintf(strong, "DUPLICATE DEFINITION");
    if (totalxref) printf output >totalxref;
    if (indivfile) printf output >indivxref;
  }
}
$1=="r" { # hyperlink/reference
  if (internal) ++internals; else ++externals;
  ++refs;
  if (src!=Psrc) { # new source
    if (report && error) printf " see %s", src;
    esrc=esc(src);
    output="<DD>" sprintf(link, "%s" esrc, esrc);
    if (totalxref) printf output, "" >totalxref;
    if (indivfile) printf output, backpath >indivxref;
    P4="";
  }
}
linenr!="" { # take care of line number
  if (linenr==P4) output="*"; else output=", " esc(linenr);
  if (report && error) printf output;
  if (totalxref) printf output >totalxref;
  if (indivfile) printf output >indivxref;
  P4=linenr;
  Psrc=src;
}
{ # always
  P1=$1; P5=$5; P6=$6; P10=""$10; Pinternal=internal; Purl=url;
}
END {
  if (Psrc!="") {
    output="\n";
    if (report && error) printf output;
    if (totalxref) printf output >totalxref;
    if (indivfile) printf output >indivxref;
  }
  if (Purl!="") {
    if (refs) { # count number of referenced resources
      if (Pinternal) ++intresources; else ++extresources;
    }
    else { # not referenced
      ++unreferenced;
      if (report) printf "%s NOT REFERENCED\n", Purl;
      output=" " sprintf(strong, "NOT REFERENCED") "\n";
      if (totalxref) printf output >totalxref;
      if (indivfile) printf output >indivxref;
    }
    # close the last group (none was opened if there were no records)
    output="</DL>\n";
    if (totalxref) printf output >totalxref;
    output=output "<HR>" blurb htmlend esc(sprintf(shclose, shq(indivfile)));
    if (indivfile) printf output >indivxref;
    indivfile="";
  }
  # deal with statistics
  errors=unavailables+undefined+duplicates+unreferenced;
  output=sprintf("%5d resources available, of which\n", resources);
  output=output sprintf("  %5d HTML documents, providing\n", htmldocuments);
  output=output sprintf("  %5d fragment definitions and\n\n", fragments);
  output=output sprintf("%5d hyperlinks, of which\n", internals+externals);
  output=output sprintf("  %5d internal to %5d resources\n", internals, intresources);
  output=output sprintf("  %5d external to %5d resources\n\n", externals, extresources);
  output=output sprintf("%5d inconsistencies detected\n", errors);
  if (errors) {
    output=output sprintf("  %5d hyperlinks to unavailable resources\n", unavailables);
    output=output sprintf("  %5d hyperlinks to undefined fragments\n", undefined);
    output=output sprintf("  %5d duplicate fragment definitions\n", duplicates);
    output=output sprintf("  %5d unreferenced resources\n", unreferenced);
  }
  if (statistics) { # report statistics
    if (report && errors) printf "-----\n";
    printf "%s", output;
  }
  if (totalxref) {
    printf "<HR>\n" stathdr >totalxref;
    printf "<PRE>\n%s</PRE>\n", output >totalxref;
    printf "<HR>" blurb htmlend >totalxref;
  }
}'
}

#
# All work is initiated here
#
main "$@"
# End of Script