Attached is one of the F# scripts I've really enjoyed showing to others. It's two web crawling samples (one synchronous, the next asynchronous) followed by the random-walk-of-the-web-of-life. You may want to adapt the URLs to be ones that are closer to home for your own interests. You should also feel free to adapt the code to do more interesting things, e.g. do more interesting analysis of the HTML coming off the web. Consider the code an open sample fot the use of the F# community.
It's a script, so it works best if you go through it line by line using F# Interactive and Visual Studio, though it also works if you use F# Interactive on a command line and paste lines of code into it.
Show your friends and take them through it line by line - it really is enormous fun. I've also pasted the code below to give the code highlighter a decent run :-)
//---------------------------------------------------------------------------
// Part O. Hello World
//System.Console.WriteLine("Hello World");;
System.Console.WriteLine("Hello World");;
open Printf;;
printf "Hello World\n";;
//---------------------------------------------------------------------------
// Part I. Web.
open System.Net
open System
open System.IO
let id x = x
let req = WebRequest.Create("http://www.microsoft.com")
let resp = req.GetResponse()
let stream = resp.GetResponseStream()
let reader = new IO.StreamReader(stream)
let html = reader.ReadToEnd();;
html;;
/// Fetch the contents of a web page
let http(url: string) =
let req = WebRequest.Create(url) in
let resp = req.GetResponse() in
let stream = resp.GetResponseStream() in
let reader = new IO.StreamReader(stream) in
let html = reader.ReadToEnd() in
resp.Close();
html
let google = http("http://www.google.com");;
let bbc = http("http://news.bbc.co.uk");;
let msft = http("http://www.microsoft.com");;
let nytRSS = http("http://www.nytimes.com/services/xml/rss/nyt/HomePage.xml");;
//let bbcRSS = http("http://www.bbc.co.uk/go/homepage/int/ne/nrss/log/i/-/news/rss/newsonline_uk_edition/front_page/rss.xml")
// ----------------------------
// Windows Forms
open System.Windows.Forms;;
open System.Drawing;;
let form = new Form();;
form.Visible <- true;;
form.Text <- "Welcome to F# Interactive Programming";;
form.TopMost <- true;;
let textB = new RichTextBox();;
form.Controls.Add(textB);;
textB.Dock <- DockStyle.Fill ;;
textB.Text <- nytRSS;;
textB.ForeColor <- Color.DarkBlue;;
textB.Font <- new Font("Lucida Console",12.0f,FontStyle.Bold) ;
form.Size <- new Size(400,600);;
let setText text =
textB.Text <- text
let appendText text = textB.AppendText(text + "\n");;
setText "hello";;
setText "hello again";;
//let (|>) x f = f x
let any_to_string_ex opts x = x |> any_to_layout opts |> layout_to_string opts
let show x =
let opts= { format_options.Default with printWidth = form.Width/16 } in
setText (any_to_string_ex opts x);;
(1,2,3) |> show;;
Array.create 100 (1,2,3) |> show;;
nytRSS |> setText;;
show 1;;
// ----------------------------
// Scan RSS for news titles
open System.Xml;;
open System.Collections;;
open System.Collections.Generic;;
let xdoc = new XmlDocument();;
xdoc.LoadXml(nytRSS);;
xdoc.SelectNodes("//title");;
xdoc.SelectNodes("//title") |> show;;
/// Hmmm... XPathNodeList supports System.IEnumerable
/// First extract the text from the nodes then display...
xdoc.SelectNodes("//title")
|> IEnumerable.map_with_type (fun (i:XmlNode) -> i.InnerText)
|> IEnumerable.to_list
|> show;;
// ----------------------------
// Search for URLs in HTML
open System.Text.RegularExpressions;;
let httpPat = "http://[a-z-A-Z0-9./_]*"
let urlPat = "href=\s*\"[^\"h]*(http://[^&\"]*)\"";;
let bbcUrls = Regex.Matches(bbc,urlPat);;
let getUrls (txt:string) =
Regex.Matches(txt,urlPat)
|> IEnumerable.map_with_type (fun (m:Match) -> (m.Groups.Item(1)).Value)
|> IEnumerable.to_list;;
let collectUrls url =
appendText url;
Application.DoEvents();
let html =
try http(url)
with _ -> "" in
let urls = getUrls html in
urls;;
collectUrls "http://news.google.com" |> show;;
// ----------------------------
// Crawling (Synchronous)
let crawlLimit = 10;;
let rec crawl sofar url =
if Set.size sofar >= crawlLimit or Set.mem url sofar
then sofar
else
let urls = collectUrls url in
List.fold_left crawl (Set.add url sofar) urls;;
textB.Clear();;
crawl Set.empty "http://news.google.com";;
// ----------------------------
// HTTP Requests (Asynchronous)
open System.Threading
open Microsoft.FSharp.Idioms
open System.Collections.Generic
let httpAsync (url:string) (cont: string -> unit) =
let req = WebRequest.Create(url) in
let iar =
req.BeginGetResponse((fun iar ->
let rsp = req.EndGetResponse(iar) in
let str = new StreamReader(rsp.GetResponseStream()) in
let html = str.ReadToEnd() in
rsp.Close();
cont html), 0) in
()
do httpAsync "http://www.microsoft.com" (fun html -> show html)
do httpAsync "http://www.google.com" (fun html -> show html)
let collectUrlsAsync url cont = httpAsync url (getUrls >> cont)
do collectUrlsAsync "http://news.google.com" (fun urls -> show urls)
// ----------------------------
// Crawling (Asynchronous)
/// Spawn a worker thread
let spawn (f : unit -> unit) = ThreadPool.QueueUserWorkItem(fun _ -> f ()) |> ignore
/// Add text to the window from a worker thread
let appendTextRemote t =
form.Invoke(new MethodInvoker(fun () -> appendText t)) |> ignore
let addToSet (d: Dictionary<_,_>) x =
let res = d.ContainsKey(x) in
if not res then d.Add(x,1);
res
/// Async crawling
let acrawl(url:string) =
appendTextRemote (sprintf "Crawling %s..." url);
// Local state, protected by locks
let sofar = new Dictionary<_,_>() in
let rec search url =
let wasPresent = lock sofar (fun () -> addToSet sofar url) in
if not wasPresent && sofar.Count < crawlLimit * 2 then begin
spawn (fun () ->
collectUrlsAsync url (fun urls -> List.iter search urls;
appendTextRemote url)
);
end in
spawn (fun () -> search url)
do textB.Clear()
do acrawl "http://news.google.com"
// ---------------------------------------------
// Random web walk
// List functions
// Random numbers
let rand = new System.Random()
let dice n = rand.Next(n)
let diceList xs dflt =
let n = List.length xs in
if n=0 then dflt else List.nth xs (dice n);;
// Web browser control inside a form
open System
open System.Windows.Forms;;
let wb = new WebBrowser();;
wb.Dock <- DockStyle.Fill;;
wb.AllowNavigation <- true;;
let webForm = new Form();;
webForm.Controls.Add(wb);;
webForm.Visible <- true;;
webForm.Size <- new Size(600,400);;
webForm.TopMost <- true;;
// Point it at pages and get the text
wb.Navigate("http://news.bbc.co.uk");;
let text = wb.DocumentText
// Regular expressions
open System.Text.RegularExpressions
let rx = new Regex("http://news.bbc.co.uk/[a-z0-9_.-/]+stm")
// Regular expression to filter urls
let urlsOfDocument (doc : HtmlDocument) =
let urls = doc.Links |> IEnumerable.map_with_type (fun (elt:HtmlElement) -> elt.GetAttribute("href")) in
let urls = urls |> IEnumerable.filter (fun url -> rx.IsMatch(url)) in
let urls = urls |> IEnumerable.to_list in
urls;;
// Test it
urlsOfDocument (wb.Document) |> show;;
let randomLink doc =
let urls = urlsOfDocument doc in
let url = diceList urls "http://news.bbc.co.uk" in
Printf.printf "JUMP: %s\n" url;
url;;
// Test it
randomLink wb.Document |> show;;
// Click on a timer event
let randomClick () = wb.Navigate(randomLink(wb.Document))
let timer = new Timer();;
timer.Interval <- 1500;;
timer.Tick.Add(fun _ -> randomClick ());;
timer.Start();;
// Enough!
timer.Stop();;