#!/bin/sh

# Locate invalid characters in HTML mark-up.

# /www/public_html/admin/janitor/charset.sh

# File to scan, given as the first command-line argument.
IN_FILE=$1
# Sample text containing every character that is considered valid.
CHARSET_TEXT=/www/public_html/admin/janitor/charset.txt
# Derived list built from $CHARSET_TEXT: one od(1) character name per line.
CHARSET_LIST=/www/public_html/admin/janitor/charset.lst

# Updated once a day by root's cron job.
HTML_LIST=/tmp/htm__www_public_html__html.lst

# Both reference files must be present before anything else is attempted.
# Failures are reported on stderr with a non-zero status so that callers
# can tell an aborted run from a clean scan.
for required in "$CHARSET_TEXT" "$HTML_LIST"; do
    if [ ! -f "$required" ]; then
        echo "File not found: $required" >&2
        exit 1
    fi
done

# Use this .lst (list) file to locate invalid characters in mark-up.
#
# Rebuild it when it is missing or older than the daily HTML list.  The
# explicit existence test is needed because some shells (e.g. dash) evaluate
# `-ot` as false when the first file does not exist.
#
# Pipeline: `od -t a` dumps the sample text as character names, `fmt -w 3`
# is used to break that dump into one token per line, and the grep drops
# od's numeric offset column (the only tokens with two adjacent digits).
# The pattern is quoted so the shell cannot expand it as a filename glob.
if [ ! -f "$CHARSET_LIST" ] || [ "$CHARSET_LIST" -ot "$HTML_LIST" ]; then
    od -t a "$CHARSET_TEXT" | fmt -w 3 | grep -v '[0-9][0-9]' | sort -u >"$CHARSET_LIST"
fi


# Was an input file given, and does it exist?
# An empty argument is rejected explicitly: an unquoted, empty $IN_FILE would
# make the -f test vacuous and leave od(1) reading standard input.
if [ -z "$IN_FILE" ]; then
    echo "Usage: $0 FILE" >&2
    exit 2
fi
if [ ! -f "$IN_FILE" ]; then
    echo "File not found: $IN_FILE" >&2
    exit 1
fi

# Locate invalid characters.
#
# The input file is dumped with `od -t a` (offset followed by one character
# name per byte) and every name is looked up in the list loaded from
# $CHARSET_LIST.  Each od line containing at least one unlisted name is
# printed once, prefixed with the input file name.
#
# NOTE: per od(1), the `a` type uses only the low seven bits of each byte,
# so bytes >= 0x80 show up under the name of their 7-bit counterpart.
od -t a -- "$IN_FILE" |
awk -v CHARSET_LIST="$CHARSET_LIST" -v IN_FILE="$IN_FILE" '
BEGIN {
    # Load the list of valid character names, one per line.
    while ((status = (getline line < CHARSET_LIST)) > 0) {
        CHARSET[line]++
    }
    close(CHARSET_LIST)
    if (status < 0) {
        # Without the list every line would be flagged; stop instead.
        print "Cannot read: " CHARSET_LIST > "/dev/stderr"
        exit 1
    }
}
{
    # Field 1 is the od offset; fields 2..NF are character names.
    for (i = 2; i <= NF; i++) {
        if (!($i in CHARSET)) {
            # Report the line once, then move on to the next od line.
            print IN_FILE ": " $0
            break
        }
    }
}'


# List of invalid characters:
# fmt -w 1 /tmp/htm__www_public_html__html.invalid_chars | grep -v ^/www | grep -v ^[0-9][0-9] | sort | uniq | join -j 1 -v 1 - charset.lst

###
#