Hi again,
This example shows a way to collect emails from RSS feeds of job postings, provided the latter indeed contain emails, which is (unfortunately) not the case for the links I have put at the end of the example.
In any case, this example can be used to collect links of job postings that interest you, and which you can then store wherever you like for use with part 1...
module JobSearch
open System.Net
open System
open System.IO
open System.Xml
open System.Collections
open System.Collections.Generic
open System.Text.RegularExpressions
(* Preliminary notes : much is taken and only slightly modified from the WebCrawl example by Don Syme in the F# distribution. *)
(* ============================================================================================================== *)
(** A job contains various bits of info that are usually easily spotted in job openings posted on the web.
- job title : one-liner
- job description : blabla to tell you that application is for great candidates only
- link to a more detailed description ... where you can hopefully find some email to spam, err, to contact
On other webpages, such as job portals where queries can be made up, e.g. http://foo/search.cgi?keyword=baker ,
the same strategy may be used although it incurs parsing html to spot descriptions etc.
*)
module Job =
  begin
    (* One job posting: the three pieces of text kept for each opening. *)
    type t =
      {
        title : string ;        (* e.g., car driver *)
        description : string ;  (* e.g., be a car driver for the NYC taxi driver company *)
        link : string           (* e.g., http://www.foobar.com *)
      }

    (* A posting whose three fields are all the empty string. *)
    let empty = { title = "" ; description = "" ; link = "" }

    (* Field accessors, usable as first-class functions. *)
    let title (job : t) = job.title
    let description (job : t) = job.description
    let link (job : t) = job.link
  end
(* ============================================================================================================== *)
(** In this module, we keep a function equivalent to the .Net Regex.IsMatch + a few default patterns.
This allows easy addition of common / recurring patterns if you wish to add some...
*)
module Pattern =
  begin
    (* [is_match s pat] is true when the .Net regular expression [pat] matches somewhere in [s]. *)
    let is_match (s:string) (pat:string) = Regex.IsMatch(s, pat)

    (* Ready-made patterns. [email] and [href] expose the interesting text as capture group 1. *)
    module Defaults =
      begin
        // Adapted from http://regexlib.com/DisplayPatterns.aspx ; simple enough for emails found on web job postings.
        // The pattern is deliberately NOT anchored with ^ and $ : it is used to search whole pages, and an
        // anchored pattern could only match a page consisting of nothing but one address.
        // The local part uses an optional group instead of a repeated one : both accept the same strings,
        // but the repeated form backtracks exponentially on long alphanumeric runs once the anchors are gone.
        let email = "(([0-9a-zA-Z]([-.\w]*[0-9a-zA-Z])?@([0-9a-zA-Z][-\w]*[0-9a-zA-Z]\.)+[a-zA-Z]{2,9}))"

        // Absolute URL : a scheme of 3 to 5 lowercase letters, then a run of URL characters.
        let website = "[a-z]{3,5}://[a-z-A-Z0-9./_]+"

        // href attribute holding an absolute URL ; group 1 is the URL between the double quotes.
        // The scheme starts right after the opening quote, so the first letter of the scheme is captured too.
        let href = "href=\s*\"([a-z]{3,5}://[^\"]+)\""
      end
  end
(* ============================================================================================================== *)
(** This module only includes a function to fetch the content of a remote page (whether RSS feed or normal webpage).
It also has a parsing function that returns a list of the matches for a given pattern in the given remote
page content.
*)
module Webpage =
  begin
    (* [of_uri uri] downloads [uri] and returns the whole response body as one string.
       The reader and the response are closed before returning, including when reading raises,
       so repeated calls do not pile up open connections.
       Exceptions raised by the request itself are not caught here. *)
    let of_uri (uri : string) =
      let req = WebRequest.Create(uri) in
      let resp = req.GetResponse() in
      try
        let reader = new IO.StreamReader(resp.GetResponseStream()) in
        (try reader.ReadToEnd()
         finally reader.Close())
      finally
        resp.Close()

    (* [matches content pat] lists, for every match of the regular expression [pat] in [content],
       the text of capture group 1. [pat] is therefore expected to contain at least one group. *)
    let matches content (pat:string) =
      Regex.Matches(content, pat)
      |> IEnumerable.map_with_type (fun (m:Match) -> (m.Groups.Item(1)).Value)
      |> IEnumerable.to_list
  end
(* ============================================================================================================== *)
(** Concrete example with a basic RSS feed
usage :
  Rss.make "http://.../rss.xml" ;
  Rss.clean ["cab"; "manhattan"] ["poor salary"; "mean boss"; "bronx"] ;
  Rss.collect_emails ()
*)
module Rss =
  begin
    (* Every job read so far, over all the feeds given to [make]. Jobs are prepended as they are read. *)
    let all = ref ([] : Job.t list)

    (* Builds a job from its title, description and link. *)
    let make_job t d l : Job.t = {title = t; description = d; link = l}

    (* Inner text of the child element [name] of [node], or "" when [node] has no such child.
       A feed item is not required to carry all three of title, description and link. *)
    let child_text (node : XmlNode) (name : string) =
      match node.Item(name) with
      | null -> ""
      | child -> child.InnerText

    (* Parses [content] as XML and adds one job to [all] for every item element found. *)
    let get_info content =
      let xdoc = new XmlDocument() in
      xdoc.LoadXml(content) ;
      xdoc.SelectNodes("//item")
      |> IEnumerable.map_with_type (fun (i:XmlNode) -> child_text i "title", child_text i "description", child_text i "link")
      |> IEnumerable.to_list
      |> List.iter (fun (t, d, l) -> all := (make_job t d l) :: !all)

    (* Downloads the feed at [uri] and adds its jobs to [all]. *)
    let make uri =
      let content = Webpage.of_uri uri in
      get_info content

    (* [clean matching_patterns avoid_patterns] filters [all] in place.
       A job is kept when its title or description matches EVERY pattern of [matching_patterns]
       and neither its title nor its description matches ANY pattern of [avoid_patterns].
       Both lists are always applied ; an empty list simply filters nothing. *)
    let clean matching_patterns avoid_patterns =
      let mentions job pat =
        Pattern.is_match (Job.title job) pat || Pattern.is_match (Job.description job) pat in
      matching_patterns |> List.iter
        (fun pat -> all := !all |> List.filter (fun job -> mentions job pat)) ;
      avoid_patterns |> List.iter
        (fun pat -> all := !all |> List.filter (fun job -> not (mentions job pat)))

    (* Fetches the page behind the link of every job currently in [all] and returns all the email
       addresses found on those pages. Jobs without a link are skipped.
       This is a function, so it sees the jobs present at the time of the call. *)
    let collect_emails () =
      !all
      |> List.filter (fun job -> Job.link job <> "")
      |> List.map (fun job -> Webpage.matches (Webpage.of_uri (Job.link job)) Pattern.Defaults.email)
      |> List.concat

    (* Kept for backward compatibility only. This is a plain value, computed once when the module is
       initialised, i.e. before any feed has been loaded into [all]. Call [collect_emails ()] instead. *)
    let all_emails = collect_emails ()
  end
(* example *)
let _ =
  let some_feeds = [
    "http://jobs.efinancialcareers.co.uk/UK.rss" ;
    "http://jobs.efinancialcareers.co.uk/France.rss" ;
    "http://jobs.efinancialcareers.co.uk/Switzerland.rss" ;
  ]
  in
  some_feeds |> List.iter Rss.make ;
  Rss.clean ["equity|fx|interest rate|commodity|commodities"] ["assistant";"credit";"sales"] ;
  Rss.collect_emails () |> List.iter (fun s -> print_string (s ^"\n")) ; // won't print anything when emails aren't displayed in the job postings :'(
  print_any Rss.all // should print all job postings matching the above regex's
Note : this requires compiler v. 1.1.12.5 (because of the node.Item("foo").InnerText syntax).