From 38023c1aa583468b912076ac5b5b799b644dce30 Mon Sep 17 00:00:00 2001
From: christian
Date: Sat, 8 Jun 2024 21:03:19 +0200
Subject: [PATCH] fixed issue where it would write the jobs twice

---
 scraper.go | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/scraper.go b/scraper.go
index 00a6023..8a2b32d 100644
--- a/scraper.go
+++ b/scraper.go
@@ -40,13 +40,27 @@ func skillChecker(description string) skills {
 		Typescript: strings.Contains(description, "TypeScript"),
 	}
 }
+func jobsToJson(file *os.File) {
+	// Encode jobs slice to JSON
+	encoder := json.NewEncoder(file)
+	encoder.SetIndent("", " ") // Pretty-print with indentation
+	if err := encoder.Encode(jobs); err != nil {
+		log.Fatalf("Cannot write to file %q: %s", fName, err)
+	}
+
+	fmt.Println("Job details successfully written to", fName)
+}


 // Slice to store job details
-var jobs []job
+var (
+	jobs     []job
+	jobCount int
+	maxJobs  int    = 30
+	fName    string = "jobs.json"
+)

 func main() {
-	fName := "jobs.json"
 	file, err := os.Create(fName)
 	if err != nil {
 		log.Fatalf("Cannot create file %q: %s", fName, err)
@@ -69,6 +83,8 @@ func main() {
 	detailsCollector := c.Clone()
 	// On every <div> element with class "card__content" attribute call callback
 	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
+		//ensure only scrape the amount of jobs specified
+
 		// Get the title and ensure it doesn't contain any excluded words
 		title := e.ChildText("span.card-job-find-list__position")
 		for _, excludedWord := range excluded {
@@ -80,15 +96,16 @@ func main() {

 		fullLink := baseUrl + link
 		detailsCollector.Visit(fullLink)
+
 	})

 	detailsCollector.OnRequest(func(r *colly.Request) {
 		fmt.Println("Visiting", r.URL.String())
 	})


-	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {
-		// Get logo and trim the url
+	detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {
+		// Get logo and trim the url
 		logo := e.ChildAttr("div.media-item__image", "style")
 		cutLeft := "background-image:url("
 		cutRight := ");"
@@ -106,6 +123,12 @@ func main() {
 			Skills: skillChecker(e.ChildText("content.text-block__content > span")),
 		}
 		jobs = append(jobs, jobDetails)
+		jobCount++
+		fmt.Println("Scraped job", jobCount)
+		if jobCount == maxJobs {
+			jobsToJson(file)
+			os.Exit(0)
+		}
 	})
 	// Handle pagination
 	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
@@ -119,12 +142,4 @@ func main() {

 	c.Visit(searchString)

-	// Encode jobs slice to JSON
-	encoder := json.NewEncoder(file)
-	encoder.SetIndent("", " ") // Pretty-print with indentation
-	if err := encoder.Encode(jobs); err != nil {
-		log.Fatalf("Cannot write to file %q: %s", fName, err)
-	}
-
-	fmt.Println("Job details successfully written to", fName)
 }
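As context for the change above, here is a minimal, self-contained sketch of the pattern this patch introduces: count collected jobs and encode the JSON output exactly once when the limit is reached, instead of encoding again at the end of main. The job struct, the maxJobs value of 3, and the stand-in loop below are illustrative assumptions, not the project's actual colly callbacks.

// Illustrative sketch only: the job fields, maxJobs, and the loop are assumptions.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
)

type job struct {
	Title string `json:"title"`
	Link  string `json:"link"`
}

var (
	jobs     []job
	jobCount int
	maxJobs  = 3
	fName    = "jobs.json"
)

// jobsToJson encodes the collected jobs to the output file a single time.
func jobsToJson(file *os.File) {
	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ") // pretty-print with indentation
	if err := encoder.Encode(jobs); err != nil {
		log.Fatalf("Cannot write to file %q: %s", fName, err)
	}
	fmt.Println("Job details successfully written to", fName)
}

func main() {
	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s", fName, err)
	}
	defer file.Close()

	// Stand-in for the scraping callback: each collected job bumps the
	// counter, and the file is written exactly once when the limit is hit.
	for i := 1; i <= 10; i++ {
		jobs = append(jobs, job{Title: fmt.Sprintf("Job %d", i), Link: "https://example.com"})
		jobCount++
		fmt.Println("Scraped job", jobCount)
		if jobCount == maxJobs {
			jobsToJson(file)
			return // single write; no second encode at the end of main
		}
	}
}

In this sketch a plain return is used rather than os.Exit(0) so the deferred file.Close() still runs; the patch itself exits directly, which skips deferred cleanup.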