One way to parse links is to use regular expressions; however, HTML links can come in many forms and combinations of attributes, which is why it is easier to use the built-in Swing HTMLEditorKit parser to parse the HTML and extract the links. It works like a SAX parser: each time the parser encounters the start, the end, or the content of a tag, it calls the appropriate method (which you override) to handle it. The variable state keeps track of whether we are inside a link or somewhere else. The link, title and text are then stored in a LinkItem object, which is added to the links array list. The code can be easily modified to parse different types of HTML tags.

package com.augustli.html;

import java.io.Reader;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import javax.swing.text.MutableAttributeSet;
import javax.swing.text.html.HTML;
import javax.swing.text.html.HTMLEditorKit;
import javax.swing.text.html.parser.ParserDelegator;

/**
 * Extracts hyperlinks (<code>&lt;a href="..."&gt;</code> elements) from an
 * HTML string using the built-in Swing HTMLEditorKit parser.
 * @author augustli
 */
public class ExtractLinks {
	
	/**
	 * Container object for one extracted link: its <code>href</code>, its
	 * optional <code>title</code> attribute and the text between the opening
	 * and closing tags.
	 * <p>
	 * Declared static so that an item does not keep a hidden reference to the
	 * enclosing ExtractLinks instance, and public because it is the element
	 * type of the list returned by the public {@link ExtractLinks#getLinks(String)}.
	 */
	public static class LinkItem {
		private String link;
		private String title;
		private String text;
		
		/** @return the value of the <code>title</code> attribute, or null if the tag had none */
		public String getTitle() {
			return title;
		}

		/** @param title the value of the <code>title</code> attribute (may be null) */
		public void setTitle(String title) {
			this.title = title;
		}

		/** @return the value of the <code>href</code> attribute */
		public String getLink() {
			return link;
		}

		/** @param link the value of the <code>href</code> attribute */
		public void setLink(String link) {
			this.link = link;
		}

		/** @return the text found between the opening and closing tag */
		public String getText() {
			return text;
		}

		/** @param text the text found between the opening and closing tag */
		public void setText(String text) {
			this.text = text;
		}
	}

	/**
	 * Swing HTMLEditorKit parser callback. The parser calls the handle*
	 * methods as it meets start tags, text and end tags; this callback
	 * collects every <code>&lt;a&gt;</code> element that has an
	 * <code>href</code> into the list supplied to the constructor.
	 */
	static class Parser extends HTMLEditorKit.ParserCallback {
		/** Initial capacity of the buffer that accumulates the link text. */
		private static final int SIZE = 128;

		/** Whether the parser is currently between an opening and a closing A tag. */
		private enum State { INSIDE_LINK, OUTSIDE_LINK }

		private State state = State.OUTSIDE_LINK;
		private final List<LinkItem> links;
		private String href;
		private String title;
		private StringBuilder text;
		
		/**
		 * @param links list that receives one LinkItem per extracted link
		 */
		public Parser(List<LinkItem> links) {
			this.links = links; //pass in our list 
		}

		/**
		 * Remembers the attributes of an opening A tag and starts collecting
		 * its text. All other tags are ignored.
		 */
		@Override
		public void handleStartTag(HTML.Tag tag, MutableAttributeSet attrSet, int pos) {
			if (tag == HTML.Tag.A) {  //check what kind of tag
				//get the attributes
				href = attributeAsString(attrSet, HTML.Attribute.HREF);
				title = attributeAsString(attrSet, HTML.Attribute.TITLE);
				text = new StringBuilder(SIZE);
				state = State.INSIDE_LINK;
			}
		}

		/**
		 * Appends text to the current link; text outside a link is ignored.
		 */
		@Override
		public void handleText(char[] data, int pos) {
			if (state == State.INSIDE_LINK) {
				text.append(data);  
			}
		}

		/**
		 * On a closing A tag, stores the collected link (only if it had an
		 * <code>href</code>) and leaves the inside-link state.
		 */
		@Override
		public void handleEndTag(HTML.Tag tag, int pos) {
			if (tag == HTML.Tag.A && state == State.INSIDE_LINK) {
				if (href != null) { 
					LinkItem itm = new LinkItem();
					itm.setLink(href);
					itm.setTitle(title);
					itm.setText(text.toString());
					links.add(itm);
				}
				state = State.OUTSIDE_LINK;
			}
		}

		/**
		 * Reads an attribute as a String without a blind cast, so a
		 * non-String attribute value cannot raise a ClassCastException.
		 * @return the attribute value as text, or null if the attribute is absent
		 */
		private static String attributeAsString(MutableAttributeSet attrSet, HTML.Attribute name) {
			Object value = attrSet.getAttribute(name);
			return value == null ? null : value.toString();
		}
	}

	/**
	 * Get list of links from HTML string
	 * @param htmlContent HTML string; null or empty yields an empty list
	 * @return list with one LinkItem per <code>&lt;a&gt;</code> element that
	 *         has an <code>href</code> attribute, in document order; never null
	 * @throws Exception if the underlying parser fails
	 */
	public List<LinkItem> getLinks(String htmlContent) throws Exception {
		List<LinkItem> links = new ArrayList<LinkItem>();
		// Nothing to parse; also avoids the NullPointerException that
		// new StringReader(null) would throw.
		if (htmlContent == null || htmlContent.length() == 0) {
			return links;
		}
		Reader reader = new StringReader(htmlContent);
		try {
			// third argument is ParserDelegator.parse's ignoreCharSet flag
			new ParserDelegator().parse(reader, new Parser(links), true);
		} finally {
			reader.close();
		}
		return links;
	}

	/**
	 * Test Program: parses a small HTML snippet and prints each link found.
	 * @param args unused
	 * @throws Exception if parsing fails
	 */
	public static void main(String... args) throws Exception {
		String html = "<html><body><a id='a' href='a.html' title='Title A'>Test A</a>"+
                "<A TITLE=\"Title B\" HREF=\"b.html\">Test B</A>"+
                "<a class='link' href='c.html'>Test C</a></body></html>";
		ExtractLinks fl = new ExtractLinks();
		for(LinkItem itm : fl.getLinks(html)) {
			System.out.println("Link: " + itm.getLink() +
                        "\ttext: " + itm.getText() + 
                        "\ttitle: " + itm.getTitle());
		}
	}

}

Advertisement