#! /bin/sed -nf
#
# Get the title of an HTML document.
# Look first for TITLE tag, then fall back on first Heading tag. Otherwise
# produce nothing.
#
# Casper Boden-Cummins <mister_pink@bigfoot.com>, 1998
#

/<[Tt][Ii][Tt][Ll][Ee]/ {
  : title
  # read up to closing tag
  /<\/[Tt][Ii][Tt][Ll][Ee]>/ !{
    N
    b title
  }

  # strip leading/trailing whitespace
  # Mario Niebaum <Mario.Niebaum@e-technik.tu-chemnitz.de>
  s/^.*<[tT][iI][tT][lL][eE]>[        ]*//
  s/[         ]*<\/[tT][iI][tT][lL][eE]>.*$//

  # strip HTML tags and print
  b print
}  

/<[Hh][0-7]/ {
  : heading
  # lowercase heading tags
  s/\(<\/\{0,1\}\)[Hh]\([0-7]\)/\1h\2/g

  # strip characters up to opening tag
  G
  s/<h\([0-7]>\)\(.*\)\(.\)$/\3\1\2/
  s/.*\n/<h/

  : match_tags
  /^<h\([0-7]\).*<\/h\1/ !{
    N
    b match_tags
  }

  # strip leading/trailing whitespace
  s/^.*<[Hh][0-7]>[        ]*//
  s/[         ]*<\/[Hh][0-7]>.*$//

  # clean up and print
  b print
}  

# nothing found, so continue to next line
b 

# print:
# remove embedded tags and leading/trailing whitespace from
# pattern space, print result and quit
: print

# reduce leading/trailing whitespace around newlines
s/[ 	]*\n[	 ]*/ /g

# convert embedded TAGS to whitespace
s/[ 	]*<\/*[a-zA-Z0-9]\{1,\}>[	 ]*/ /g

# remove non-printable characters
s/[^ -;=?-~]//g

# strip leading/trailing whitespace from line
s/^[ 	]*\(.*[^ 	]\)[ 	]*$/\1/

# print and quit
p  
q  

### colorized by sedsed, a sed script debugger/indenter/tokenizer/HTMLizer
### original script: http://pcsiwa.rett.polimi.it/~paolob/seders/scripts/get_html_title.sed