Beyond that, though, you have to start worrying about where you can remove
whitespace without changing document semantics.
It is very simple. Basically, any run of whitespace can be collapsed
to a single space, except inside <pre>...</pre> and inside "..." quoted strings.
See
http://mindprod.com/projects/htmlcompactor.html for details.
Compaction is very fast, done with a simple state machine.
Here's the core of it:
package com.mindprod.compactor;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Iterator;
import com.mindprod.filter.AllDirectoriesFilter;
import com.mindprod.filter.ClamFilter;
import com.mindprod.filter.CommandLine;
import com.mindprod.hunkio.HunkIO;
/**
* Compacts HTML, possibly java source or other text
*
* @author Roedy Green
* @version 1.0
*/
public class Compactor
{
/**
* Constructor
*/
public Compactor ( )
{
}
/**
* true collapse multiple spaces to one.
*/
public boolean compactWhitespace = true;
/**
* collapse whitespace even inside comments.
* false not implemented.
*/
public static final boolean compactWhitespaceInComments = true;
/**
* convert html tags to lower case.
* true not implemented.
*/
public static final boolean lowerCaseTags = false;
/**
* maximum allowable blank lines. usually 0 or 1.
*/
public int maxAllowableBlankLines = 0;
/**
* true use Unix \n line terminator,
* false use Windows \r\n or other platform-specific terminator
*/
public boolean oneCharLf = true;
/**
* true strip out all comments except SSI and macros,
* true not implemented
*/
public static final boolean removeComments = false;
/**
* true remove leading space from lines.
*/
public boolean removeLeadSpaces = true;
/**
* true remove trailing space from lines.
* false will never be implemented.
*/
public static final boolean removeTrailingSpaces = true;
/**
* true removes what htmlmacros generate.
* Would normally be false if you are going
* to send this to the web.
* true not implemented.
*/
public static final boolean removeMacroGenerations = false;
/**
* true remove macros.
* This hides how you generated your HTML from
* the outside world.
* The catch is, you can never regenerate your macros again.
* If true, then removeMacroGenerations should be
* false.
* true not implemented.
*/
public static final boolean removeMacros = false;
/**
* Remove unnecessary space on either side of tags.
* It depends on the tag and the amount
* of space on the other side of the tag
* whether space can be completely removed.
* true not implemented.
*/
public static final boolean removeSpaceAroundTags = false;
/**
* Consolidate tags. e.g <span class="x">this
* </span><span class="x">and that</span>
* can be collapsed to <span
* class="x">this and that</span>.
* true not implemented.
*/
public static final boolean consolidateTags = false;
/**
* true convert to CBF, compact binary format.
* The catch here is web browsers can't read this
* without a plugin. This is the main compaction.
* true not implemented.
*/
public static final boolean tokenise = false;
/**
* true LZW compression.
* the catch is, browsers can't read this without
* a special plugin.
* true not implemented
*/
public static final boolean zip = false;
/**
* StringBuffer to accumulate the result file
* character by character.
*/
protected StringBuffer sb;
/**
* how many spaces we have outstanding we have not
* yet put in the StringBuffer.
*/
protected int pendingSpaces;
/**
* HowMany newLines we have outstanding we have not
* put in the StringBuffer.
*/
protected int pendingNewLines;
/**
* true if inside <pre>...</pre> where spaces preserved
*/
protected boolean inPre;
/**
* put the pending newlines and spaces into
* the StringBuffer.
*/
protected void emitPending()
{
// adjust pending newlines, but leave <pre> completely alone.
if ( pendingNewLines > maxAllowableBlankLines+1 && !inPre )
{
pendingNewLines = maxAllowableBlankLines+1;
}
// adjust pending spaces
if ( pendingNewLines > 0 )
{
if ( removeLeadSpaces && !inPre )
{
pendingSpaces = 0;
}
}
else if ( compactWhitespace && pendingSpaces > 0 && !inPre )
{
pendingSpaces = 1;
}
// emit pending newLines
for ( int i=0; i<pendingNewLines; i++ )
{
if ( oneCharLf )
{
sb.append ( '\n' );
}
else
{
sb.append ( lineSeparator );
}
}
pendingNewLines = 0;
// emit pending spaces
for ( int i=0; i<pendingSpaces; i++ )
{
sb.append( ' ' );
}
pendingSpaces = 0;
}
/**
* platform specific line separator char
*/
private static String lineSeparator = System.getProperty (
"line.separator" );
/**
* compact and tidy one file.
*
* @param fileBeingProcessed
* File to compact and tidy.
* @param quiet true if want progress messages suppressed
*
* @exception IOException
*/
public void compactFile( File fileBeingProcessed, boolean quiet )
throws IOException
{
if ( ! quiet )
{
System.out.print(" compacting " +
fileBeingProcessed.getName()+ " " );
}
if ( ! (fileBeingProcessed.getName().endsWith(".html") ||
fileBeingProcessed.getName().endsWith(".htm")) )
{
System.out.println( "Cannot compact: " +
fileBeingProcessed.getName() + "not .html file");
return;
}
String big = HunkIO.readEntireFile( fileBeingProcessed );
String result = compactString( big );
if ( result.equals( big ) )
{
// nothing changed. No need to write results.
if ( ! quiet )
{
System.out.println( "-" );
}
return;
}
// generate output into a temporary file until we are sure all
is ok.
// create a temp file in the same directory as filename
if ( ! quiet )
{
System.out.println( "*" );
}
File tempfile = HunkIO.createTempFile ("temp", ".tmp",
fileBeingProcessed );
FileWriter emit = new FileWriter( tempfile );
emit.write( result );
emit.close();
// successfully created output in same directory as input,
// Now make it replace the input file.
fileBeingProcessed.delete();
tempfile.renameTo( fileBeingProcessed );
} // end processFile
/**
* compact the string by removing whitespace.
*
* @param big Fluffy string you want compacted.
* @return compacted string.
*/
public String compactString ( String big )
{
int originalLength = big.length();
sb = new StringBuffer( originalLength );
pendingSpaces = 0;
pendingNewLines = 0;
inPre = false;
// loop through each char categorising it
// deal with
// removing lead spaces.
// removing trail spaces.
// collapsing excess whitespace
for ( int i=0; i<originalLength; i++ )
{
char c = big.charAt( i );
switch ( c )
{
default:
// deal with pending newlines and spaces first
if ( pendingSpaces > 0 || pendingNewLines > 0 )
{
emitPending();
}
// now emit the normal character.
sb.append( c );
break;
case '\r':
/* should we ignore this \r ? */
/* We do if it was immediately followed by a \n */
if ( !( i+1 < originalLength && big.charAt( i+1 ) ==
'\n' ) )
{
// it was a standalone Mac style \r, treat like \n
// always ignore trailing spaces, even when inPre =
true.
pendingSpaces = 0;
pendingNewLines++;
}
// otherwise ignore it
break;
case '\n':
// always ignore trailing spaces, even when inPre =
true.
pendingSpaces = 0;
pendingNewLines++;
break;
case '<':
// deal with pending newlines and spaces first
if ( pendingSpaces > 0 || pendingNewLines > 0 )
{
emitPending();
}
if ( inPre )
{
if ( ( i + 6 <= originalLength ) && big.substring(
i, i+6 ).equalsIgnoreCase("</pre>") )
{
inPre = false;
}
}
else
{
if ( ( i + 5 <= originalLength ) && big.substring(
i, i+5 ).equalsIgnoreCase("<pre>") )
{
inPre = true;
}
}
// now emit the normal character.
sb.append( '<' );
break;
case 0:
case 1:
case 2:
case 3:
case 4:
case 5:
case 6:
case 7:
case 8:
case '\t': // 9 tab
// case 10: lf
case 11:
case 12:
// case 13: cr
case 14:
case 15:
case 16:
case 17:
case 18:
case 19:
case 20:
case 21:
case 22:
case 23:
case 24:
case 25:
case 26:
case 27:
case 28:
case 29:
case 30:
case 31:
case ' ': // 32: space
pendingSpaces++;
} // end switch
} // end for
// allow file to end with out a final newline, but no more than
one.
pendingSpaces = 0;
if ( pendingNewLines > 1 )
{
pendingNewLines = 1;
}
emitPending();
return sb.toString();
} // end processString
/**
* compacts HTML files.
*
* @param args names of files to process, dirs, files, -s, *.*,
no wildards.
*/
public static void main ( String[] args )
{
// gather all the files mentioned on the command line.
// either directories, files, *.*, with -s and subdirs option.
System.out.println( "Gathering files to process..." );
Iterator wantedFiles =
CommandLine.getFilesToProcess(
args, /* what is on the command
line */
1000, /* estimate of expected files
*/
new AllDirectoriesFilter(),
new ClamFilter( "", ".html" )
);
Compactor c = new Compactor();
for ( Iterator iter=wantedFiles; iter.hasNext(); )
{
File file = (File)iter.next();
try
{
c.compactFile( file , false /* not quiet */ );
}
catch ( FileNotFoundException e )
{
System.out.println( "Error: " + file.getAbsolutePath() +
" not found." );
}
catch ( Exception e )
{
System.out.println( e.getMessage() + " in file " +
file.getAbsolutePath() );
System.out.println();
e.printStackTrace();
}
} // end for
} // end main
} // end Compactor