#!/bin/sh
# Locate invalid characters in HTML mark-up.
# /www/public_html/admin/janitor/charset.sh
#
# Usage: charset.sh FILE
#
# Dumps FILE with `od -t a` (one named character per field) and prints
# "FILE: <od line>" for every dump line that contains a character name
# not listed in CHARSET_LIST.  CHARSET_LIST is derived from CHARSET_TEXT
# (the set of every character found in that file) and is rebuilt when it
# is missing or older than HTML_LIST.
#
# Exit status: 0 on a completed scan, 1 when a required file is missing
# or the character list cannot be written.

IN_FILE=$1

CHARSET_TEXT=/www/public_html/admin/janitor/charset.txt
CHARSET_LIST=/www/public_html/admin/janitor/charset.lst

# Updated once a day by root's cron job.
HTML_LIST=/tmp/htm__www_public_html__html.lst

# Print a diagnostic on stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

[ -f "$CHARSET_TEXT" ] || die "File not found: $CHARSET_TEXT"
[ -f "$HTML_LIST" ] || die "File not found: $HTML_LIST"

# Use this .lst (list) file to locate invalid characters in mark-up:
#if [ $HTML_LIST -ot $CHARSET_LIST ]; then
# The explicit existence test is needed because `-ot` against a missing
# file is not true in every shell, which would leave the list ungenerated.
if [ ! -f "$CHARSET_LIST" ] || [ "$CHARSET_LIST" -ot "$HTML_LIST" ]; then
  # od -v: dump every line; without it repeated lines are abbreviated to
  #        "*", which would end up in the list as a bogus valid character.
  # fmt -w 3: put each od field on a line of its own.
  # grep -v: drop the od offset column (the only fields with two digits).
  od -v -t a "$CHARSET_TEXT" \
    | fmt -w 3 \
    | grep -v '[0-9][0-9]' \
    | sort \
    | uniq >"$CHARSET_LIST" \
    || die "Cannot write: $CHARSET_LIST"
fi

# Does input file exist?
[ -n "$IN_FILE" ] || die "Usage: $0 FILE"
[ -f "$IN_FILE" ] || die "File not found: $IN_FILE"

# Locate invalid characters.
# Field 1 of each od line is the offset, so checking starts at field 2.
# A line is reported once, at its first character that is not in the list.
od -t a -- "$IN_FILE" \
  | awk -v CHARSET_LIST="$CHARSET_LIST" -v IN_FILE="$IN_FILE" '
      BEGIN {
        # Load the valid character names, one per line, into a lookup set.
        while ((getline line < CHARSET_LIST) > 0) {
          CHARSET[line]++
        }
        close(CHARSET_LIST)
      }
      {
        for (i = 2; i <= NF; i++) {
          if (!($i in CHARSET)) {
            print IN_FILE ": " $0
            break
          }
        }
      }
    '

# List of invalid characters:
# fmt -w 1 /tmp/htm__www_public_html__html.invalid_chars | grep -v ^/www | grep -v ^[0-9][0-9] | sort | uniq | join -j 1 -v 1 - charset.lst
###
#