From 32f83e358b0a72818444abb122bed44bdc9dd10d Mon Sep 17 00:00:00 2001 From: ChrQR Date: Tue, 11 Jun 2024 11:38:05 +0200 Subject: [PATCH] added scraper for it-jobbank --- .gitignore | 4 +- main.go | 178 +++++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 155 insertions(+), 27 deletions(-) diff --git a/.gitignore b/.gitignore index dd569eb..0733327 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,4 @@ /thehub_cache -/jobs.json \ No newline at end of file +/thehub.json +/itjobbank_cache +/it-jobbank.json \ No newline at end of file diff --git a/main.go b/main.go index 3449aeb..501816d 100644 --- a/main.go +++ b/main.go @@ -12,15 +12,16 @@ import ( ) type job struct { - Title string `json:"title"` - Logo string `json:"logo"` - Company string `json:"company"` - Location string `json:"location"` - Type string `json:"type"` - Description string `json:"description"` - Link string `json:"link"` - Skills skills `json:"skills"` - FirstSeen time.Time `json:"firstSeen"` + Title string `json:"title"` + Logo string `json:"logo"` + Company string `json:"company"` + Location string `json:"location"` + Type string `json:"type"` + Description string `json:"description"` + Link string `json:"link"` + Skills skills `json:"skills"` + FirstSeen time.Time `json:"firstSeen"` + Source string `json:"source"` } type skills struct { @@ -33,6 +34,9 @@ type skills struct { Tailwind bool `json:"tailwind"` } +// Utility functions + +// Checks if a string contains any of the given keywords func skillChecker(description string) skills { return skills{ React: strings.Contains(description, "React"), @@ -44,7 +48,9 @@ func skillChecker(description string) skills { Tailwind: strings.Contains(description, "Tailwind"), } } -func jobsToJson(file *os.File) { + +// Converts job struct to json +func jobsToJson(file *os.File, jobs []job, fName string) { // Encode jobs slice to JSON encoder := json.NewEncoder(file) encoder.SetIndent("", " ") // Pretty-print with indentation @@ -55,24 +61,47 @@ func jobsToJson(file *os.File) { fmt.Println("Job details successfully written to", fName) } +func checkIfPaid(description string) { + for _, keyword := range unpaidKeywords { + if strings.Contains(strings.ToLower(description), keyword) { + return + } + } +} + +func checkIfStudent(description string) string { + for _, keyword := range studentKeywords { + if strings.Contains(strings.ToLower(description), keyword) { + return "student" + } + } + return "full time" +} + // Slice to store job details var ( - jobs []job - jobCount int - maxJobs int = 30 - fName string = "jobs.json" - excluded = []string{"senior", "lead", "founder", "cto", "vp of"} + excluded = []string{"senior", "lead", "founder", "cto", "vp of", "erfaren", "arkitekt", "architect", "manager", "ulønnet", "unpaid", "praktik"} + unpaidKeywords = []string{"unpaid", "praktik", "ulønnet"} + studentKeywords = []string{"studerende", "studenter", "student", "medhjælper"} ) func scrapeHub() { + // declare and initialize variables + var ( + jobs []job + jobCount int + fName = "thehub.json" + maxJobs = 30 + baseUrl = "https://thehub.io" + searchString = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs" + file, err = os.Create(fName) + ) - file, err := os.Create(fName) if err != nil { log.Fatalf("Cannot create file %q: %s", fName, err) } defer file.Close() - baseUrl := "https://thehub.io" - searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs" + // Instantiate default collector c := colly.NewCollector( // visit only the hub @@ -86,7 +115,9 @@ func scrapeHub() { detailsCollector := c.Clone() // On every
element with class "card__content attribute call callback c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) { - //ensure only scrape the amount of jobs specified + if jobCount >= maxJobs { + return + } // Get the title and ensure it doesn't contain any excluded words title := e.ChildText("span.card-job-find-list__position") @@ -114,14 +145,14 @@ func scrapeHub() { cutRight := ");" trimmedLogo := strings.Trim(logo, cutLeft+cutRight) - // Get job description - // Get the HTML of the description + // Get the HTML of the description and check to see if it's paid descriptionHTML, err := e.DOM.Find("content.text-block__content > span").Html() if err != nil { log.Printf("Error getting HTML of description: %s", err) return } - // Get company name + + // fill in the job struct jobDetails := job{ Title: e.ChildText("h2[class=view-job-details__title]"), Logo: trimmedLogo, @@ -132,13 +163,14 @@ func scrapeHub() { Link: e.Request.URL.String(), Skills: skillChecker(e.ChildText("content.text-block__content > span")), FirstSeen: time.Now(), + Source: baseUrl, } jobs = append(jobs, jobDetails) jobCount++ fmt.Println("Scraped job", jobCount) - if jobCount == maxJobs { - jobsToJson(file) - os.Exit(0) + if jobCount >= maxJobs { + jobsToJson(file, jobs, fName) + return } }) // Handle pagination @@ -146,7 +178,6 @@ func scrapeHub() { nextPage := e.Attr("href") if nextPage != "" { fullNextPage := baseUrl + nextPage - fmt.Println("Visiting next page:", fullNextPage) e.Request.Visit(fullNextPage) } }) @@ -155,8 +186,103 @@ func scrapeHub() { } +func scrapeItJobBank() { + // declare and initialize variables + var ( + jobs []job + jobCount int + fName = "it-jobbank.json" + maxJobs = 30 + baseUrl = "https://www.it-jobbank.dk" + searchString = "https://www.it-jobbank.dk/jobsoegning" + file, err = os.Create(fName) + ) + if err != nil { + log.Fatalf("Cannot create file %q: %s", fName, err) + } + defer file.Close() + // Instantiate default collector + c := colly.NewCollector( + // visit only the hub + colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"), + + // Cache responses to prevent multiple requests + colly.CacheDir("./itjobbank_cache"), + ) + + // Instantiate a new collector to visit the job details page + detailsCollector := c.Clone() + // On every
element with class "card__content attribute call callback + c.OnHTML("div[class=result]", func(e *colly.HTMLElement) { + if jobCount >= maxJobs { + return + } + // Get the title and ensure it doesn't contain any excluded words + title := e.ChildText("h3.job-title > a") + for _, excludedWord := range excluded { + if strings.Contains(strings.ToLower(title), excludedWord) { + return + } + } + fullLink := e.ChildAttr("h3.job-title > a", "href") + + detailsCollector.Visit(fullLink) + }) + + detailsCollector.OnRequest(func(r *colly.Request) { + fmt.Println("Visiting", r.URL.String()) + }) + + detailsCollector.OnHTML("section > div", func(e *colly.HTMLElement) { + // get the description as html + descriptionHTML, err := e.DOM.Find("div[id=job_ad]").Html() + if err != nil { + log.Printf("Error getting HTML of description: %s", err) + return + } + // Check if the job is paid + checkIfPaid(descriptionHTML) + // fill in the job struct + title := e.ChildText("h1.title") + if title == "" { + title = e.ChildText("h1[id=jobtitle]") + } + jobDetails := job{ + Title: title, + Logo: baseUrl + e.ChildAttr("div.companmony-logo > img", "src"), + Company: e.ChildText("p.published"), + Location: e.ChildText("div.job-location > p.caption"), + Type: checkIfStudent(descriptionHTML), + Description: descriptionHTML, + Link: e.Request.URL.String(), + Skills: skillChecker(descriptionHTML), + FirstSeen: time.Now(), + Source: baseUrl, + } + jobs = append(jobs, jobDetails) + jobCount++ + fmt.Println("Scraped job", jobCount) + if jobCount >= maxJobs { + jobsToJson(file, jobs, fName) + return + } + }) + // Handle pagination + c.OnHTML("a.page-link", func(e *colly.HTMLElement) { + if jobCount >= maxJobs { + return + } + nextPage := e.Attr("href") + if nextPage != "" { + e.Request.Visit(nextPage) + } + }) + + c.Visit(searchString) +} func main() { scrapeHub() + scrapeItJobBank() }