#!/bin/sed -f
# Thu May 18 12:43:45 CEST 2000 by tilmann@bitterberg.de
#
# Description:
#   Creates an index of links from an HTML file.
#   Does something similar to lynx -force_html -dump, but
#   leaves the document as HTML (it generates an index of links).
#
# Example: Input
#
#   foo1 <a href="http://link.org">Click here</a> foo2
#
#
#   Output:
#
#   foo1 <a href="http://link.org">[1] Click here</a> foo2
#
#   [1] http://link.org
# # # NOTE: # 1) Will break at links like foo2\n 1 2 3 4 .. 500 # using newline as separator to the 's' command and 'I' for casei s \(a *href\) *= *\("\([^"]\+\)"[^>]*>\)\([^\n]*\(\n\)\) \([^ ]*\)\(.*$\) \1|||||=\2[\6] \4\7\5[\6] \3
I #|----1----| |----------2---------||-------4------| |---6---||--7--| # |---3----| |--5-| # Field Contains: # \1 a href # \2 the link text up to the closing > # \3 the link itself (http://foo.com) # \4 the rest of the input line # \5 a newline (\n) # \6 the number we would like to use # \7 everything up to the end of patternspace # # Now the line looks like: # foo1
[1] foo2\n 2 3 4 .. 500\n[1] blah.html
t loop; # look if there is another link in that line s/|||||//g; # delete marker h; # save how many numbers are used s/\n.*//; # "restore" the original line x s/[^\n]*\n// x } } # Just before the insert index /<\/[Bb][Oo][Dd][Yy]>/{ x; # insert saved stuff s/[^\n]*\n//; # delete unused numbers s/^/
/ G }