// Command main scrapes developer job postings from thehub.io and
// it-jobbank.dk and writes each site's results to a JSON file in the current
// working directory.
package main

import (
	"encoding/json"
	"fmt"
	"log"
	"os"
	"strings"
	"time"

	"github.com/gocolly/colly"
)

// job is a single scraped job posting in the shape it is serialized to JSON.
type job struct {
	Title       string `json:"title"`
	Logo        string `json:"logo"`
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	Description string `json:"description"`
	Link        string `json:"link"`
	Skills      skills `json:"skills"`
	Scraped     string `json:"scraped"`
	Source      string `json:"source"`
}

// skills records which technologies are mentioned in a job description.
type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
	Tailwind   bool `json:"tailwind"`
}

// Keyword lists used to filter and classify postings. Every entry is lower
// case because it is matched against lower-cased text.
var (
	// excluded holds title keywords that cause a posting to be skipped.
	excluded = []string{"senior", "lead", "founder", "cto", "vp of", "erfaren", "arkitekt",
		"architect", "manager", "ulønnet", "unpaid", "praktik", "cyber", "leder", "sikkerhed", "supporter"}
	// unpaidKeywords mark a description as an unpaid position.
	unpaidKeywords = []string{"unpaid", "praktik", "ulønnet"}
	// studentKeywords mark a description as a student position.
	studentKeywords = []string{"studerende", "studenter", "student", "medhjælper"}
)

// Utility functions

// containsAny reports whether text, compared case-insensitively, contains at
// least one of keywords. The keywords themselves must already be lower case.
func containsAny(text string, keywords []string) bool {
	// Lower-case once instead of once per keyword.
	lowered := strings.ToLower(text)
	for _, keyword := range keywords {
		if strings.Contains(lowered, keyword) {
			return true
		}
	}
	return false
}

// skillChecker reports which of the tracked technologies appear in
// description. Matching is a case-sensitive substring search on the
// conventional spelling of each technology (e.g. "TypeScript", "Next.js").
func skillChecker(description string) skills {
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(description, "Python"),
		Golang:     strings.Contains(description, "Golang"),
		Svelte:     strings.Contains(description, "Svelte"),
		Nextjs:     strings.Contains(description, "Next.js"),
		Typescript: strings.Contains(description, "TypeScript"),
		Tailwind:   strings.Contains(description, "Tailwind"),
	}
}

// jobsToJson encodes jobs as indented JSON into file. fName is only used in
// log output. The process is terminated via log.Fatalf if encoding fails.
func jobsToJson(file *os.File, jobs []job, fName string) {
	encoder := json.NewEncoder(file)
	encoder.SetIndent("", " ") // Pretty-print with indentation
	if err := encoder.Encode(jobs); err != nil {
		log.Fatalf("Cannot write to file %q: %s", fName, err)
	}
	fmt.Println("Job details successfully written to", fName)
}

// checkIfPaid reports whether description looks like a paid position, i.e.
// it contains none of unpaidKeywords (case-insensitive).
//
// The function previously returned nothing, which made the check a no-op.
// Statement-style calls that ignore the result still compile.
func checkIfPaid(description string) bool {
	return !containsAny(description, unpaidKeywords)
}

// checkIfStudent classifies description as "student" when it contains any of
// studentKeywords (case-insensitive) and as "full time" otherwise.
func checkIfStudent(description string) string {
	if containsAny(description, studentKeywords) {
		return "student"
	}
	return "full time"
}

// logoURLFromStyle extracts the URL from an inline CSS declaration such as
// `background-image:url(https://example.com/logo.png);`. Optional quotes
// around the URL are removed. It returns "" when style has no url(...) part.
//
// strings.Trim must not be used for this: its second argument is a set of
// characters, so it also strips trailing URL characters such as the "ng" in
// ".png".
func logoURLFromStyle(style string) string {
	const marker = "url("
	start := strings.Index(style, marker)
	if start < 0 {
		return ""
	}
	rest := style[start+len(marker):]
	if end := strings.Index(rest, ")"); end >= 0 {
		rest = rest[:end]
	}
	return strings.Trim(strings.TrimSpace(rest), `"'`)
}

// scrapeHub crawls the thehub.io job listing, follows every non-excluded
// posting to its details page and writes up to maxJobs postings to
// thehub.json.
func scrapeHub() {
	const (
		fName               = "thehub.json"
		maxJobs             = 30
		baseURL             = "https://thehub.io"
		searchString        = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"
		descriptionSelector = "content.text-block__content > span"
	)

	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s", fName, err)
	}
	defer func() {
		if err := file.Close(); err != nil {
			log.Printf("Cannot close file %q: %s", fName, err)
		}
	}()

	// Non-nil so that an empty result is encoded as [] rather than null.
	jobs := make([]job, 0, maxJobs)

	// Listing collector, restricted to thehub.io and backed by an on-disk
	// response cache.
	c := colly.NewCollector(
		colly.AllowedDomains("www.thehub.io", "thehub.io"),
		colly.CacheDir("./thehub_cache"),
	)

	// Separate collector for the job details pages.
	detailsCollector := c.Clone()

	// For every job card on a listing page, visit the details page unless the
	// title contains an excluded word or enough jobs were collected already.
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		if len(jobs) >= maxJobs {
			return
		}

		title := e.ChildText("span.card-job-find-list__position")
		if containsAny(title, excluded) {
			return
		}

		link := e.ChildAttr("a", "href")
		if link == "" {
			// Without a link baseURL+link would point at the site root.
			return
		}
		fullLink := baseURL + link
		if err := detailsCollector.Visit(fullLink); err != nil {
			log.Printf("Cannot visit %q: %s", fullLink, err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("div[class='view-job-details']", func(e *colly.HTMLElement) {
		if len(jobs) >= maxJobs {
			return
		}

		// The description is stored as HTML; the search URL already filters
		// for paid positions (paid=true), so no paid check is done here.
		descriptionHTML, err := e.DOM.Find(descriptionSelector).Html()
		if err != nil {
			log.Printf("Error getting HTML of description: %s", err)
			return
		}

		jobs = append(jobs, job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        logoURLFromStyle(e.ChildAttr("div.media-item__image", "style")),
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: descriptionHTML,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(e.ChildText(descriptionSelector)),
			// Time.String is a debugging format (it includes the monotonic
			// clock reading); RFC 3339 is stable and machine-readable.
			Scraped: time.Now().Format(time.RFC3339),
			Source:  baseURL,
		})
		fmt.Println("Scraped job", len(jobs))
	})

	// Handle pagination
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		if len(jobs) >= maxJobs {
			return
		}
		nextPage := e.Attr("href")
		if nextPage == "" {
			return
		}
		// Page links repeat on every listing page, so errors for pages that
		// were visited before are expected and deliberately ignored.
		_ = e.Request.Visit(baseURL + nextPage)
	})

	// The collector is created without colly.Async, so Visit returns once the
	// crawl, including the nested visits started from callbacks, has finished.
	if err := c.Visit(searchString); err != nil {
		log.Printf("Cannot visit %q: %s", searchString, err)
	}

	// Write exactly once, also when fewer than maxJobs postings were found.
	jobsToJson(file, jobs, fName)
}

// scrapeItJobBank crawls the it-jobbank.dk development listing, follows every
// non-excluded posting to its details page and writes up to maxJobs paid
// postings to it-jobbank.json.
func scrapeItJobBank() {
	const (
		fName        = "it-jobbank.json"
		maxJobs      = 30
		baseURL      = "https://www.it-jobbank.dk"
		searchString = "https://www.it-jobbank.dk/jobsoegning/udvikling"
	)

	file, err := os.Create(fName)
	if err != nil {
		log.Fatalf("Cannot create file %q: %s", fName, err)
	}
	defer func() {
		if err := file.Close(); err != nil {
			log.Printf("Cannot close file %q: %s", fName, err)
		}
	}()

	// Non-nil so that an empty result is encoded as [] rather than null.
	jobs := make([]job, 0, maxJobs)

	// Listing collector, restricted to it-jobbank.dk and backed by an on-disk
	// response cache.
	c := colly.NewCollector(
		colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"),
		colly.CacheDir("./itjobbank_cache"),
	)

	// Separate collector for the job details pages.
	detailsCollector := c.Clone()

	// For every search result, visit the details page unless the title
	// contains an excluded word or enough jobs were collected already.
	c.OnHTML("div[class=result]", func(e *colly.HTMLElement) {
		if len(jobs) >= maxJobs {
			return
		}

		title := e.ChildText("h3.job-title > a")
		if containsAny(title, excluded) {
			return
		}

		href := e.ChildAttr("h3.job-title > a", "href")
		if href == "" {
			return
		}
		// Resolve against the listing page so relative links work as well.
		fullLink := e.Request.AbsoluteURL(href)
		if err := detailsCollector.Visit(fullLink); err != nil {
			log.Printf("Cannot visit %q: %s", href, err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("section > div", func(e *colly.HTMLElement) {
		// The selector can match several elements per page; never collect
		// more than maxJobs entries.
		if len(jobs) >= maxJobs {
			return
		}

		// Get the description as HTML.
		descriptionHTML, err := e.DOM.Find("div[id=job_ad]").Html()
		if err != nil {
			log.Printf("Error getting HTML of description: %s", err)
			return
		}

		// Skip postings whose description marks them as unpaid.
		if !checkIfPaid(descriptionHTML) {
			return
		}

		title := e.ChildText("h1.title")
		if title == "" {
			title = e.ChildText("h1[id=jobtitle]")
		}

		// Leave the logo empty when the page has none instead of storing the
		// bare site URL; resolve relative and absolute sources alike.
		var logo string
		if src := e.ChildAttr("div.company-logo > img", "src"); src != "" {
			logo = e.Request.AbsoluteURL(src)
		}

		jobs = append(jobs, job{
			Title:       title,
			Logo:        logo,
			Company:     e.ChildText("p.published"),
			Location:    e.ChildText("div.job-location > p.caption"),
			Type:        checkIfStudent(descriptionHTML),
			Description: descriptionHTML,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(descriptionHTML),
			// Time.String is a debugging format (it includes the monotonic
			// clock reading); RFC 3339 is stable and machine-readable.
			Scraped: time.Now().Format(time.RFC3339),
			Source:  baseURL,
		})
		fmt.Println("Scraped job", len(jobs))
	})

	// Handle pagination
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		if len(jobs) >= maxJobs {
			return
		}
		nextPage := e.Attr("href")
		if nextPage == "" {
			return
		}
		// Page links repeat on every listing page, so errors for pages that
		// were visited before are expected and deliberately ignored.
		_ = e.Request.Visit(nextPage)
	})

	// The collector is created without colly.Async, so Visit returns once the
	// crawl, including the nested visits started from callbacks, has finished.
	if err := c.Visit(searchString); err != nil {
		log.Printf("Cannot visit %q: %s", searchString, err)
	}

	// Write exactly once, also when fewer than maxJobs postings were found.
	jobsToJson(file, jobs, fName)
}

func main() {
	scrapeHub()
	scrapeItJobBank()
}