package main import ( "encoding/json" "fmt" "log" "os" "strings" "time" "github.com/gocolly/colly" ) type job struct { Title string `json:"title"` Logo string `json:"logo"` Company string `json:"company"` Location string `json:"location"` Type string `json:"type"` Description string `json:"description"` Link string `json:"link"` Skills skills `json:"skills"` Scraped string `json:"scraped"` Source string `json:"source"` } type skills struct { React bool `json:"react"` Python bool `json:"python"` Golang bool `json:"golang"` Svelte bool `json:"svelte"` Nextjs bool `json:"nextjs"` Typescript bool `json:"typescript"` Tailwind bool `json:"tailwind"` } // Utility functions // Checks if a string contains any of the given keywords func skillChecker(description string) skills { return skills{ React: strings.Contains(description, "React"), Python: strings.Contains(description, "Python"), Golang: strings.Contains(description, "Golang"), Svelte: strings.Contains(description, "Svelte"), Nextjs: strings.Contains(description, "Next.js"), Typescript: strings.Contains(description, "TypeScript"), Tailwind: strings.Contains(description, "Tailwind"), } } // Converts job struct to json func jobsToJson(file *os.File, jobs []job, fName string) { // Encode jobs slice to JSON encoder := json.NewEncoder(file) encoder.SetIndent("", " ") // Pretty-print with indentation if err := encoder.Encode(jobs); err != nil { log.Fatalf("Cannot write to file %q: %s", fName, err) } fmt.Println("Job details successfully written to", fName) } func checkIfPaid(description string) { for _, keyword := range unpaidKeywords { if strings.Contains(strings.ToLower(description), keyword) { return } } } func checkIfStudent(description string) string { for _, keyword := range studentKeywords { if strings.Contains(strings.ToLower(description), keyword) { return "student" } } return "full time" } // Slice to store job details var ( excluded = []string{"senior", "lead", "founder", "cto", "vp of", "erfaren", "arkitekt", "architect", "manager", "ulønnet", "unpaid", "praktik", "cyber", "leder", "sikkerhed", "supporter", "sr."} unpaidKeywords = []string{"unpaid", "praktik", "ulønnet"} studentKeywords = []string{"studerende", "studenter", "student", "medhjælper"} ) func scrapeHub() { var ( jobs []job jobCount int fName = "/app/data/thehub.json" maxJobs = 20 baseUrl = "https://thehub.io" searchString = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs" ) // Create file after scraping is complete c := colly.NewCollector( colly.AllowedDomains("www.thehub.io", "thehub.io"), ) detailsCollector := colly.NewCollector( colly.AllowedDomains("www.thehub.io", "thehub.io"), colly.CacheDir("/app/data/thehub_cache"), ) c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } title := e.ChildText("span.card-job-find-list__position") for _, excludedWord := range excluded { if strings.Contains(strings.ToLower(title), excludedWord) { return } } link := e.ChildAttr("a", "href") fullLink := baseUrl + link detailsCollector.Visit(fullLink) }) detailsCollector.OnRequest(func(r *colly.Request) { fmt.Println("Visiting", r.URL.String()) }) detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } logo := e.ChildAttr("div.media-item__image", "style") cutLeft := "background-image:url(" cutRight := ");" trimmedLogo := strings.Trim(logo, cutLeft+cutRight) descriptionHTML, err := e.DOM.Find("content.text-block__content > span").Html() if err != nil { log.Printf("Error getting HTML of description: %s", err) return } jobDetails := job{ Title: e.ChildText("h2[class=view-job-details__title]"), Logo: trimmedLogo, Company: e.ChildText(".bullet-inline-list > a:first-child"), Location: e.ChildText(".bullet-inline-list > a:nth-child(2)"), Type: e.ChildText(".bullet-inline-list > a:nth-child(3)"), Description: descriptionHTML, Link: e.Request.URL.String(), Skills: skillChecker(e.ChildText("content.text-block__content > span")), Scraped: time.Now().String(), Source: baseUrl, } jobs = append(jobs, jobDetails) jobCount++ fmt.Printf("Scraped job %d from TheHub\n", jobCount) }) c.OnHTML("a.page-link", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } nextPage := e.Attr("href") if nextPage != "" { fullNextPage := baseUrl + nextPage e.Request.Visit(fullNextPage) } }) // Add error handling for the initial visit err := c.Visit(searchString) if err != nil { log.Printf("Error visiting TheHub: %s", err) return } // Wait for all collectors to finish c.Wait() detailsCollector.Wait() // Write jobs to file after scraping is complete if len(jobs) > 0 { file, err := os.Create(fName) if err != nil { log.Printf("Cannot create file %q: %s", fName, err) return } defer file.Close() jobsToJson(file, jobs, fName) fmt.Printf("Successfully scraped %d jobs from TheHub\n", len(jobs)) } else { log.Println("No jobs were scraped from TheHub") } } func scrapeItJobBank() { var ( jobs []job jobCount int fName = "/app/data/it-jobbank.json" maxJobs = 20 baseUrl = "https://www.it-jobbank.dk" searchString = "https://www.it-jobbank.dk/jobsoegning/udvikling" ) c := colly.NewCollector( colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"), ) detailsCollector := colly.NewCollector( colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"), colly.CacheDir("/app/data/itjobbank_cache"), ) c.OnHTML("div[class=result]", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } title := e.ChildText("h3.job-title > a") for _, excludedWord := range excluded { if strings.Contains(strings.ToLower(title), excludedWord) { return } } fullLink := e.ChildAttr("h3.job-title > a", "href") detailsCollector.Visit(fullLink) }) detailsCollector.OnRequest(func(r *colly.Request) { fmt.Println("Visiting", r.URL.String()) }) detailsCollector.OnHTML("section > div", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } descriptionHTML, err := e.DOM.Find("div[id=job_ad]").Html() if err != nil { log.Printf("Error getting HTML of description: %s", err) return } checkIfPaid(descriptionHTML) title := e.ChildText("h1.title") if title == "" { title = e.ChildText("h1[id=jobtitle]") } jobDetails := job{ Title: title, Logo: baseUrl + e.ChildAttr("div.company-logo > img", "src"), Company: e.ChildText("p.published"), Location: e.ChildText("div.job-location > p.caption"), Type: checkIfStudent(descriptionHTML), Description: descriptionHTML, Link: e.Request.URL.String(), Skills: skillChecker(descriptionHTML), Scraped: time.Now().String(), Source: baseUrl, } jobs = append(jobs, jobDetails) jobCount++ fmt.Printf("Scraped job %d from IT JobBank\n", jobCount) }) c.OnHTML("a.page-link", func(e *colly.HTMLElement) { if jobCount >= maxJobs { return } nextPage := e.Attr("href") if nextPage != "" { e.Request.Visit(nextPage) } }) // Add error handling for the initial visit err := c.Visit(searchString) if err != nil { log.Printf("Error visiting IT JobBank: %s", err) return } // Wait for all collectors to finish c.Wait() detailsCollector.Wait() // Write jobs to file after scraping is complete if len(jobs) > 0 { file, err := os.Create(fName) if err != nil { log.Printf("Cannot create file %q: %s", fName, err) return } defer file.Close() jobsToJson(file, jobs, fName) fmt.Printf("Successfully scraped %d jobs from IT JobBank\n", len(jobs)) } else { log.Println("No jobs were scraped from IT JobBank") } } func main() { scrapeHub() scrapeItJobBank() }