diff --git a/main.go b/main.go
index 1fb8590..080ae56 100644
--- a/main.go
+++ b/main.go
@@ -86,7 +86,6 @@ var (
 )
 
 func scrapeHub() {
-	// declare and initialize variables
 	var (
 		jobs         []job
 		jobCount     int
@@ -94,32 +93,23 @@ func scrapeHub() {
 		maxJobs      = 20
 		baseUrl      = "https://thehub.io"
 		searchString = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"
-		file, err    = os.Create(fName)
 	)
-	if err != nil {
-		log.Fatalf("Cannot create file %q: %s", fName, err)
-	}
-	defer file.Close()
-
-	// Instantiate default collector
+	// Create file after scraping is complete
 	c := colly.NewCollector(
-		// visit only the hub
 		colly.AllowedDomains("www.thehub.io", "thehub.io"),
 	)
 
-	// Instantiate a new collector to visit the job details page
 	detailsCollector := colly.NewCollector(
 		colly.AllowedDomains("www.thehub.io", "thehub.io"),
 		colly.CacheDir("/app/data/thehub_cache"),
 	)
 
-	// On every <div> element with class "card__content attribute call callback
+
 	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
 		if jobCount >= maxJobs {
 			return
 		}
 
-		// Get the title and ensure it doesn't contain any excluded words
 		title := e.ChildText("span.card-job-find-list__position")
 		for _, excludedWord := range excluded {
 			if strings.Contains(strings.ToLower(title), excludedWord) {
@@ -130,7 +120,6 @@ func scrapeHub() {
 
 		fullLink := baseUrl + link
 		detailsCollector.Visit(fullLink)
-
 	})
 
 	detailsCollector.OnRequest(func(r *colly.Request) {
@@ -138,21 +127,21 @@
 	})
 
 	detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {
+		if jobCount >= maxJobs {
+			return
+		}
 
-		// Get logo and trim the url
 		logo := e.ChildAttr("div.media-item__image", "style")
 		cutLeft := "background-image:url("
 		cutRight := ");"
 		trimmedLogo := strings.Trim(logo, cutLeft+cutRight)
 
-		// Get the HTML of the description and check to see if it's paid
 		descriptionHTML, err := e.DOM.Find("content.text-block__content > span").Html()
 		if err != nil {
 			log.Printf("Error getting HTML of description: %s", err)
 			return
 		}
 
-		// fill in the job struct
 		jobDetails := job{
 			Title: e.ChildText("h2[class=view-job-details__title]"),
 			Logo:  trimmedLogo,
@@ -167,14 +156,13 @@ func scrapeHub() {
 		}
 		jobs = append(jobs, jobDetails)
 		jobCount++
-		fmt.Println("Scraped job", jobCount)
+		fmt.Printf("Scraped job %d from TheHub\n", jobCount)
+	})
+
+	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
 		if jobCount >= maxJobs {
-			jobsToJson(file, jobs, fName)
 			return
 		}
-	})
-	// Handle pagination
-	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
 		nextPage := e.Attr("href")
 		if nextPage != "" {
 			fullNextPage := baseUrl + nextPage
@@ -182,12 +170,34 @@
 		}
 	})
 
-	c.Visit(searchString)
+	// Add error handling for the initial visit
+	err := c.Visit(searchString)
+	if err != nil {
+		log.Printf("Error visiting TheHub: %s", err)
+		return
+	}
+	// Wait for all collectors to finish
+	c.Wait()
+	detailsCollector.Wait()
+
+	// Write jobs to file after scraping is complete
+	if len(jobs) > 0 {
+		file, err := os.Create(fName)
+		if err != nil {
+			log.Printf("Cannot create file %q: %s", fName, err)
+			return
+		}
+		defer file.Close()
+
+		jobsToJson(file, jobs, fName)
+		fmt.Printf("Successfully scraped %d jobs from TheHub\n", len(jobs))
+	} else {
+		log.Println("No jobs were scraped from TheHub")
+	}
 
 }
 
 func scrapeItJobBank() {
-	// declare and initialize variables
 	var (
 		jobs         []job
 		jobCount     int
@@ -195,34 +205,22 @@ func scrapeItJobBank() {
 		maxJobs      = 20
 		baseUrl      = "https://www.it-jobbank.dk"
 		searchString = "https://www.it-jobbank.dk/jobsoegning/udvikling"
-		file, err    = os.Create(fName)
 	)
-	if err != nil {
-		log.Fatalf("Cannot create file %q: %s", fName, err)
-	}
-	defer file.Close()
 
-	// Instantiate default collector
 	c := colly.NewCollector(
-		// visit only the hub
 		colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"),
-
-		// Cache responses to prevent multiple requests
-		// colly.CacheDir("./itjobbank_cache"),
 	)
 
-	// Instantiate a new collector to visit the job details page
 	detailsCollector := colly.NewCollector(
 		colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"),
-		// Cache responses to prevent multiple requests
-		colly.CacheDir("/app/data/itjobbank_cache"))
+		colly.CacheDir("/app/data/itjobbank_cache"),
+	)
 
-	// On every <div> element with class "card__content attribute call callback
 	c.OnHTML("div[class=result]", func(e *colly.HTMLElement) {
 		if jobCount >= maxJobs {
 			return
 		}
-		// Get the title and ensure it doesn't contain any excluded words
+
 		title := e.ChildText("h3.job-title > a")
 		for _, excludedWord := range excluded {
 			if strings.Contains(strings.ToLower(title), excludedWord) {
@@ -239,19 +237,23 @@
 	})
 
 	detailsCollector.OnHTML("section > div", func(e *colly.HTMLElement) {
-		// get the description as html
+		if jobCount >= maxJobs {
+			return
+		}
+
 		descriptionHTML, err := e.DOM.Find("div[id=job_ad]").Html()
 		if err != nil {
 			log.Printf("Error getting HTML of description: %s", err)
 			return
 		}
-		// Check if the job is paid
+
 		checkIfPaid(descriptionHTML)
-		// fill in the job struct
+
 		title := e.ChildText("h1.title")
 		if title == "" {
 			title = e.ChildText("h1[id=jobtitle]")
 		}
+
 		jobDetails := job{
 			Title: title,
 			Logo:  baseUrl + e.ChildAttr("div.company-logo > img", "src"),
@@ -266,13 +268,9 @@
 		}
 		jobs = append(jobs, jobDetails)
 		jobCount++
-		fmt.Println("Scraped job", jobCount)
-		if jobCount >= maxJobs {
-			jobsToJson(file, jobs, fName)
-			return
-		}
+		fmt.Printf("Scraped job %d from IT JobBank\n", jobCount)
 	})
-	// Handle pagination
+
 	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
 		if jobCount >= maxJobs {
 			return
@@ -283,7 +281,31 @@
 		}
 	})
 
-	c.Visit(searchString)
+	// Add error handling for the initial visit
+	err := c.Visit(searchString)
+	if err != nil {
+		log.Printf("Error visiting IT JobBank: %s", err)
+		return
+	}
+
+	// Wait for all collectors to finish
+	c.Wait()
+	detailsCollector.Wait()
+
+	// Write jobs to file after scraping is complete
+	if len(jobs) > 0 {
+		file, err := os.Create(fName)
+		if err != nil {
+			log.Printf("Cannot create file %q: %s", fName, err)
+			return
+		}
+		defer file.Close()
+
+		jobsToJson(file, jobs, fName)
+		fmt.Printf("Successfully scraped %d jobs from IT JobBank\n", len(jobs))
+	} else {
+		log.Println("No jobs were scraped from IT JobBank")
+	}
 }
 
 func main() {