Friday, November 27, 2015

Simple Web Crawler using BFS - Java

Making a simple web crawler using BFS in Java.

root     
         - the starting web address
regex 
         - the regular expression pattern used to extract website links from the HTML content downloaded from a web page

HTML content is downloaded using the URL class at java.net.URL

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Queue;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * A minimal breadth-first web crawler.
 *
 * <p>Starting from {@link #root}, each page is downloaded, every substring
 * matching {@link #regex} is treated as a site address, and addresses not seen
 * before are recorded in {@link #marked} and queued for a later visit.
 *
 * <p>All state is held in static fields, so the class is not thread-safe.
 */
public class WebCrawler {
    //Queue for BFS
    static Queue<String> q = new LinkedList<>();

    //URLs already visited
    static Set<String> marked = new HashSet<>();

    //URL Pattern regex: scheme (http or https) followed by a host name.
    //Host labels may contain '-', so "my-site.com" is not truncated to "my".
    static String regex = "https?://([\\w-]+\\.)*([\\w-]+)";

    //Start from here
    static String root = "http://www.codebytes.in";

    //Crawling stops once more than this many sites have been recorded
    private static final int MAX_SITES = 100;

    //Connect/read timeout so a single unresponsive host cannot stall the crawl
    private static final int TIMEOUT_MILLIS = 10_000;

    /**
     * Runs the breadth-first crawl starting at {@link #root}.
     *
     * <p>Pages that cannot be opened or read are reported on standard output
     * and skipped. The size limit is checked once per page, before the page is
     * fetched, so the final number of recorded sites can be somewhat larger
     * than {@code MAX_SITES}.
     *
     * @throws IOException kept in the signature for existing callers; download
     *         failures are handled per page and do not abort the crawl
     */
    public static void bfs() throws IOException {
        // Compile once per crawl instead of once per downloaded page.
        Pattern pattern = Pattern.compile(regex);

        q.add(root);
        // Mark the root so that links pointing back to it are not crawled again.
        marked.add(root);

        while (!q.isEmpty()) {
            //Find only almost 100 websites.
            if (marked.size() > MAX_SITES) {
                return;
            }

            String address = q.poll();
            String html;
            try {
                html = download(address);
            } catch (MalformedURLException e) {
                System.out.println("\nMalformedURL : " + address + "\n");
                // The outer loop picks the next address, or stops if none is left.
                continue;
            } catch (IOException e) {
                System.out.println("\nIOException for URL : " + address + "\n");
                continue;
            }

            Matcher matcher = pattern.matcher(html);
            while (matcher.find()) {
                String w = matcher.group();
                // Set.add returns true only for addresses not seen before.
                if (marked.add(w)) {
                    System.out.println("Site : " + w);
                    q.add(w);
                }
            }
        }
    }

    /**
     * Downloads the content behind {@code address} as text.
     *
     * <p>Lines are joined with {@code '\n'} so that a link at the end of one
     * line is not fused with the first word of the next line.
     *
     * @param address the URL to fetch
     * @return the downloaded content
     * @throws MalformedURLException if {@code address} is not a valid URL
     * @throws IOException if the connection cannot be opened or read
     */
    private static String download(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);

        StringBuilder sb = new StringBuilder();
        // try-with-resources closes the reader and the underlying stream.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), StandardCharsets.UTF_8))) {
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /** Prints the number of recorded sites followed by every entry of {@link #marked}. */
    public static void displayResults() {
        System.out.println("\n\nResults: ");
        System.out.println("\nWeb sites crawled : " + marked.size() + "\n");
        for (String s : marked) {
            System.out.println(s);
        }
    }

    /**
     * Runs the crawl and prints the results.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            bfs();
            displayResults();
        } catch (IOException e) {
            System.out.println("IOException caught : " + e);
        }
    }
}

Output : 

Site : http://www.w3.org
Site : http://www.google.com
Site : http://www.codebytes.in
Site : https://www.blogger.com
Site : http://google
Site : http://schema.org
Site : https://plus.google.com
Site : http://2.bp.blogspot.com
Site : https://apis.google.com
Site : http://www.webplatform.org
Site : https://www.edx.org
Site : http://www.w3devcampus.com
Site : http://testthewebforward.org
Site : https://www.w3.org
Site : https://github.com
Site : https://www.ldc.upenn.edu
Site : http://devvar.org
Site : http://www.industryofthingsworldusa.com
Site : http://webaudio.gatech.edu
Site : http://www2016.ca
Site : http://validator.w3.org
Site : http://jigsaw.w3.org
Site : http://vimeo.com
Site : http://www.fundaciononce.es
Site : http://lists.w3.org
Site : http://twitter.com
Site : http://www.csail.mit.edu
Site : http://www.ercim.eu
Site : http://www.keio.ac.jp
Site : http://ev.buaa.edu.cn
Site : http://www.google.co.in
Site : https://play.google.com
Site : http://www.youtube.com
Site : http://news.google.co.in
Site : https://mail.google.com
Site : https://drive.google.com
Site : https://www.google.co.in
Site : https://accounts.google.com
Site : https://ssl.gstatic.com
Site : https://support.google.com
Site : https://www.google.com

IOException for URL : http://google

Site : http://github.com
Site : http://blog.schema.org

IOException for URL : http://2.bp.blogspot.com


IOException for URL : https://apis.google.com

Site : http://docs.webplatform.org
Site : http://en.wikipedia.org
Site : http://blog.webplatform.org
Site : https://twitter.com
Site : https://www.facebook.com
Site : http://webchat.freenode.net
Site : http://richard.esplins.org
Site : https://developers.google.com
Site : http://5by5.tv
Site : https://stats.webplatform.org

IOException for URL : https://www.edx.org

Site : http://yoast.com
Site : http://wp.me
Site : http://wordpress.org
Site : http://www.linkedin.com
Site : https://classroom.w3devcampus.com
Site : http://classroom.w3devcampus.com
Site : http://w3cshop.spreadshirt.net
Site : http://eepurl.com
Site : http://ssl.gstatic.com
Site : https://t.co
Site : http://stats.wordpress.com
Site : http://ogp.me
Site : https://assets
Site : https://api.github.com
Site : https://enterprise.github.com
Site : https://help.github.com
Site : https://desktop.github.com
Site : https://status.github.com
Site : https://developer.github.com
Site : https://training.github.com
Site : https://shop.github.com
Site : http://purl.org
Site : http://xmlns.com
Site : http://rdfs.org
Site : http://drupal.org
Site : http://ldc
Site : https://www.youtube.com
Site : http://www.upenn.edu
Site : http://catalog.ldc.upenn.edu
Site : https://catalog.ldc.upenn.edu
Site : http://www.ldc.upenn.edu
Site : https://ssl
Site : http://www
Site : https://freenode.net
Site : http://www.brownbaglunch.fr
Site : http://clermontech.org
Site : http://www.meetup.com
Site : http://afpyro.afpy.org
Site : http://www.aperoweb.fr
Site : http://www.lacantine
Site : http://www.43117.tl
Site : http://toulonux.org
Site : http://gullivar.org
Site : http://tedxtoulon.com
Site : http://e1
Site : http://fonts.googleapis.com
Site : http://html5shiv.googlecode.com
Site : https://www.linkedin.com
Site : https://www.xing.com
Site : http://www.deliveryofthingsworld.com
Site : http://www.securityofthingsworld.com
Site : http://www.industryofthingsworld.com
Site : http://www.we
Site : http://www.facebook.com


Results:

Web sites crawled : 107

http://catalog.ldc.upenn.edu
https://www.linkedin.com
http://www
https://www.blogger.com
https://support.google.com
http://twitter.com
https://catalog.ldc.upenn.edu
http://jigsaw.w3.org
https://ssl
https://www.youtube.com
http://www.google.co.in
https://www.google.com
http://docs.webplatform.org
https://mail.google.com
https://t.co
http://www.webplatform.org
https://assets
http://www.csail.mit.edu
https://twitter.com
http://drupal.org
http://www.upenn.edu
http://www.deliveryofthingsworld.com
http://devvar.org
http://www.aperoweb.fr
https://classroom.w3devcampus.com
https://ssl.gstatic.com
https://shop.github.com
http://blog.schema.org
http://ssl.gstatic.com
http://rdfs.org
http://www2016.ca
http://www.linkedin.com
http://yoast.com
http://clermontech.org
http://afpyro.afpy.org
http://en.wikipedia.org
http://www.lacantine
http://www.fundaciononce.es
http://news.google.co.in
http://www.codebytes.in
http://html5shiv.googlecode.com
http://blog.webplatform.org
http://eepurl.com
https://plus.google.com
https://status.github.com
https://help.github.com
http://purl.org
http://google
http://www.industryofthingsworld.com
http://stats.wordpress.com
http://e1
https://drive.google.com
https://developers.google.com
http://validator.w3.org
http://www.we
http://www.facebook.com
https://training.github.com
http://toulonux.org
https://www.facebook.com
https://github.com
https://play.google.com
http://wordpress.org
http://webchat.freenode.net
http://w3cshop.spreadshirt.net
http://www.w3.org
http://www.43117.tl
https://developer.github.com
http://www.ercim.eu
http://github.com
http://ogp.me
http://www.brownbaglunch.fr
http://schema.org
http://fonts.googleapis.com
https://accounts.google.com
http://xmlns.com
http://www.google.com
http://www.securityofthingsworld.com
https://apis.google.com
https://freenode.net
http://vimeo.com
http://2.bp.blogspot.com
http://ev.buaa.edu.cn
https://stats.webplatform.org
http://www.ldc.upenn.edu
http://tedxtoulon.com
http://wp.me
http://www.w3devcampus.com
http://richard.esplins.org
http://www.youtube.com
http://webaudio.gatech.edu
https://www.ldc.upenn.edu
http://lists.w3.org
http://classroom.w3devcampus.com
https://www.edx.org
http://testthewebforward.org
http://ldc
http://gullivar.org
https://enterprise.github.com
https://api.github.com
https://www.xing.com
https://www.w3.org
http://www.industryofthingsworldusa.com
https://www.google.co.in
http://5by5.tv
https://desktop.github.com
http://www.meetup.com
http://www.keio.ac.jp

No comments:

Post a Comment