// Package main implements an AWS Lambda handler that scrapes developer
// job postings from thehub.io and returns them as JSON, with a short
// in-memory cache so repeated invocations in a warm container do not
// re-scrape on every call.
package main

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/aws/aws-lambda-go/lambda"
	"github.com/gocolly/colly"
)

// job is a single scraped job posting.
type job struct {
	Title       string `json:"title"`
	Logo        string `json:"logo"`
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	Description string `json:"description"`
	Skills      skills `json:"skills"`
	Link        string `json:"link"`
}

// skills flags which known technologies a job description mentions.
type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
}

var (
	jobs      []job     // cached results of the last successful fetch
	lastFetch time.Time // when jobs was last refreshed
	cacheTTL  = time.Minute * 5
	jobLimit  = 20
)

// skillChecker reports which known technologies appear in the given job
// description, using case-sensitive substring matching.
//
// NOTE(review): "Go" also matches words such as "Good" or "Google";
// kept as-is to preserve existing matching behavior.
func skillChecker(description string) skills {
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(description, "Python"),
		Golang:     strings.Contains(description, "Go"),
		Svelte:     strings.Contains(description, "Svelte"),
		Nextjs:     strings.Contains(description, "Next.js"),
		Typescript: strings.Contains(description, "TypeScript"),
	}
}

// fetchData scrapes up to jobLimit job postings from thehub.io (following
// pagination), filters out titles containing excluded words, and replaces
// the package-level jobs slice on success. Responses are cached on disk by
// colly to limit outbound requests.
func fetchData() error {
	baseUrl := "https://thehub.io"

	// Instantiate the default collector, restricted to thehub.io, with
	// on-disk response caching to prevent repeated identical requests.
	c := colly.NewCollector(
		colly.AllowedDomains("www.thehub.io", "thehub.io"),
		colly.CacheDir("./thehub_cache"),
	)

	// Words that disqualify a job title (matched case-insensitively).
	excluded := []string{"senior", "lead"}

	// Separate collector for the per-job details pages.
	detailsCollector := c.Clone()

	// Collect into a fresh slice so a re-fetch after cache expiry does not
	// append duplicates to the previous results (bug fix: jobs previously
	// grew unbounded across fetches because it was never reset).
	fetched := make([]job, 0, jobLimit)
	jobCount := 0

	// Each listing card: filter by title, then visit its details page.
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		if jobCount >= jobLimit {
			return
		}
		title := e.ChildText("span.card-job-find-list__position")
		for _, excludedWord := range excluded {
			if strings.Contains(strings.ToLower(title), excludedWord) {
				return
			}
		}
		link := e.ChildAttr("a", "href")
		if err := detailsCollector.Visit(baseUrl + link); err != nil {
			// Best-effort: log and continue with the remaining listings.
			fmt.Println("Visiting details failed:", err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {
		if jobCount >= jobLimit {
			return
		}

		// The logo URL is embedded in an inline style attribute shaped as:
		//   background-image:url(<logo-url>);
		// Bug fix: strings.Trim treats its second argument as a character
		// SET and could strip legitimate leading/trailing characters of the
		// URL itself; TrimPrefix/TrimSuffix remove only the exact wrappers.
		logo := e.ChildAttr("div.media-item__image", "style")
		logo = strings.TrimPrefix(logo, "background-image:url(")
		logo = strings.TrimSuffix(logo, ");")

		// NOTE(review): selector targets a <content> element; unusual —
		// verify against the live page markup (possibly div.text-block__content).
		description := e.ChildText("content.text-block__content > span")

		fetched = append(fetched, job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        logo,
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: description,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(description),
		})
		jobCount++
	})

	// Handle pagination on the listing pages.
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		if nextPage := e.Attr("href"); nextPage != "" {
			fullNextPage := baseUrl + nextPage
			fmt.Println("Visiting next page:", fullNextPage)
			e.Request.Visit(fullNextPage)
		}
	})

	// Kick off the scrape from the filtered search page.
	if err := c.Visit("https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs"); err != nil {
		return err
	}

	// Publish the new batch only after a successful crawl; a failed crawl
	// leaves the previous cache intact.
	jobs = fetched
	return nil
}

// handler returns the cached job list while it is still fresh; otherwise it
// re-scrapes thehub.io and updates the cache timestamp.
func handler(ctx context.Context) ([]job, error) {
	// Serve from cache when it is recent and non-empty.
	if time.Since(lastFetch) < cacheTTL && len(jobs) > 0 {
		return jobs, nil
	}
	if err := fetchData(); err != nil {
		return nil, err
	}
	lastFetch = time.Now()
	return jobs, nil
}

func main() {
	lambda.Start(handler)
}