C
chingooo3k
Hi,
I am a newbie to java and html parsing although I have done lex/yacc
compilers before. I am trying to leech any http link from a given file
be it a proper 'http://www.....' or just a reference like
'/somedierctory/..../stuff' .... For now I plan on running quick tests
on the local file references to see if they exist or not on the hard
drive and so I got into Java and regular expressions ....
Can the java gurus here (hehe ok I'm not being picky) please comment on
my code and how I can optimize it ? Please don't just say it 'sucks' (I
know it does) .. give me a 'because' and perhaps some pointers on how
to make it not so sucky
Thanks.
*******************************
*******************************
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
public class InternalLinkChecker
{
private static Pattern pattern;
private static Matcher matcher;
private static String REGEX;
private static BufferedReader in = null;
private static FileWriter out_rep = null;
public static void main (String [] args)
{
try
{
if(args.length != 1)
throw new IllegalArgumentException("Need to let me know which
file.");
else
{
File file = new File(args[0]);
if (file.exists())
{
in = new BufferedReader(new FileReader(file));
StringBuffer buff = new StringBuffer();
int c;
while((c=in.read())!= -1)
buff.append((char) c);
StringBuffer temp2 = new StringBuffer();
String blah;
String [] Split;
int count = 0;
REGEX = "(<a href=)[^\\s]+(\")";
pattern = Pattern.compile(REGEX, Pattern.CASE_INSENSITIVE);
matcher = pattern.matcher(buff);
while(matcher.find())
{
System.out.println("----------------------");
System.out.println("I found: \' " + matcher.group() + "' \n" +
"Range: " + matcher.start() + " to " + matcher.end());
count++;
temp2.append(matcher.group());
}
System.out.println("\n so I found a total of " + count + "
URLS.");
blah = temp2.toString();
blah = blah.replaceAll("(?i)<A HREF=\"","");
Split = blah.split("\"");
out_rep = new FileWriter(new File("Rep.txt"));
for (int i=0; i<Split.length; i++)
out_rep.write(Split + "\n");
}
else
{
throw new IllegalArgumentException("Your file does not exist!");
}
}
}
catch (IOException e)
{
System.err.println(e);
e.printStackTrace();
}
finally
{
try
{
in.close(); out_rep.close();
}
catch (IOException ex)
{
ex.printStackTrace();
System.err.println(ex);
}
}
}
}
*****************************
******************************
I am a newbie to java and html parsing although I have done lex/yacc
compilers before. I am trying to leech any http link from a given file
be it a proper 'http://www.....' or just a reference like
'/somedierctory/..../stuff' .... For now I plan on running quick tests
on the local file references to see if they exist or not on the hard
drive and so I got into Java and regular expressions ....
Can the java gurus here (hehe ok I'm not being picky) please comment on
my code and how I can optimize it ? Please don't just say it 'sucks' (I
know it does) .. give me a 'because' and perhaps some pointers on how
to make it not so sucky
Thanks.
*******************************
*******************************
import java.io.*;
import java.net.*;
import java.util.*;
import java.util.regex.*;
public class InternalLinkChecker
{
private static Pattern pattern;
private static Matcher matcher;
private static String REGEX;
private static BufferedReader in = null;
private static FileWriter out_rep = null;
public static void main (String [] args)
{
try
{
if(args.length != 1)
throw new IllegalArgumentException("Need to let me know which
file.");
else
{
File file = new File(args[0]);
if (file.exists())
{
in = new BufferedReader(new FileReader(file));
StringBuffer buff = new StringBuffer();
int c;
while((c=in.read())!= -1)
buff.append((char) c);
StringBuffer temp2 = new StringBuffer();
String blah;
String [] Split;
int count = 0;
REGEX = "(<a href=)[^\\s]+(\")";
pattern = Pattern.compile(REGEX, Pattern.CASE_INSENSITIVE);
matcher = pattern.matcher(buff);
while(matcher.find())
{
System.out.println("----------------------");
System.out.println("I found: \' " + matcher.group() + "' \n" +
"Range: " + matcher.start() + " to " + matcher.end());
count++;
temp2.append(matcher.group());
}
System.out.println("\n so I found a total of " + count + "
URLS.");
blah = temp2.toString();
blah = blah.replaceAll("(?i)<A HREF=\"","");
Split = blah.split("\"");
out_rep = new FileWriter(new File("Rep.txt"));
for (int i=0; i<Split.length; i++)
out_rep.write(Split + "\n");
}
else
{
throw new IllegalArgumentException("Your file does not exist!");
}
}
}
catch (IOException e)
{
System.err.println(e);
e.printStackTrace();
}
finally
{
try
{
in.close(); out_rep.close();
}
catch (IOException ex)
{
ex.printStackTrace();
System.err.println(ex);
}
}
}
}
*****************************
******************************