// sw-jobs-go-lambda/main.go

package main

import (
	"context"
	"fmt"
	"strings"
	"time"

	"github.com/aws/aws-lambda-go/lambda"
	"github.com/gocolly/colly"
)

// job is a single scraped job posting. fetchData fills it from a posting's
// detail page and handler returns a slice of them; the struct tags give the
// JSON key for each field.
type job struct {
	// Title is the text of the posting's h2.view-job-details__title element.
	Title       string `json:"title"`
	// Logo is derived from the inline style attribute of the
	// div.media-item__image element.
	Logo        string `json:"logo"`
	// Company, Location and Type are the texts of the first, second and third
	// anchors inside .bullet-inline-list, respectively.
	Company     string `json:"company"`
	Location    string `json:"location"`
	Type        string `json:"type"`
	// Description is the text scraped from the posting body.
	Description string `json:"description"`
	// Link is the URL of the detail page the posting was scraped from.
	Link        string `json:"link"`
	// Skills is computed from the description by skillChecker.
	Skills      skills `json:"skills"`
}

// skills records, one flag per technology, whether skillChecker found that
// technology mentioned in a job description. The struct tags give the JSON
// key for each flag.
type skills struct {
	React      bool `json:"react"`
	Python     bool `json:"python"`
	Golang     bool `json:"golang"`
	Svelte     bool `json:"svelte"`
	Nextjs     bool `json:"nextjs"`
	Typescript bool `json:"typescript"`
}

// Package-level state shared by handler and fetchData.

// jobs holds the scraped postings that handler returns.
var jobs []job

// lastFetch is the time handler last completed a call to fetchData without
// error; its zero value means no fetch has succeeded yet.
var lastFetch time.Time

// cacheTTL is how long handler keeps serving jobs before calling fetchData
// again.
var cacheTTL = 5 * time.Minute

// jobLimit caps the number of postings collected by one fetchData run.
var jobLimit = 20

// skillChecker reports which of the tracked technologies are mentioned in a
// job description.
//
// Matching rules:
//   - Go must appear as the standalone, capitalized word "Go", or as "golang"
//     in any casing. A plain substring search for "Go" would also fire on
//     words such as "Google", "Good" or "Gothenburg".
//   - React stays a case-sensitive substring match, because lowercase "react"
//     is an ordinary English verb.
//   - Python, Svelte, Next.js and TypeScript are matched case-insensitively so
//     that common spellings ("Typescript", "NextJS", "python") are detected.
//
// An empty description yields the zero skills value.
func skillChecker(description string) skills {
	lower := strings.ToLower(description)
	return skills{
		React:      strings.Contains(description, "React"),
		Python:     strings.Contains(lower, "python"),
		Golang:     containsWord(description, "Go") || strings.Contains(lower, "golang"),
		Svelte:     strings.Contains(lower, "svelte"),
		Nextjs:     strings.Contains(lower, "next.js") || strings.Contains(lower, "nextjs"),
		Typescript: strings.Contains(lower, "typescript"),
	}
}

// containsWord reports whether word occurs in text with no word character
// (see isWordByte) directly before or after it. The comparison is
// case-sensitive. An empty word never matches.
func containsWord(text, word string) bool {
	if word == "" {
		return false
	}
	for from := 0; from <= len(text)-len(word); {
		i := strings.Index(text[from:], word)
		if i < 0 {
			return false
		}
		start := from + i
		end := start + len(word)
		leftOK := start == 0 || !isWordByte(text[start-1])
		rightOK := end == len(text) || !isWordByte(text[end])
		if leftOK && rightOK {
			return true
		}
		// Not a standalone occurrence; keep searching after this position.
		from = start + 1
	}
	return false
}

// isWordByte reports whether b can be part of a word: an ASCII letter, digit
// or underscore. Bytes >= 0x80 belong to multi-byte UTF-8 sequences and are
// conservatively treated as word characters as well.
func isWordByte(b byte) bool {
	switch {
	case b >= 'a' && b <= 'z':
		return true
	case b >= 'A' && b <= 'Z':
		return true
	case b >= '0' && b <= '9':
		return true
	}
	return b == '_' || b >= 0x80
}

// fetchData scrapes developer job postings from thehub.io and, on success,
// replaces the package-level jobs slice with the freshly scraped postings.
//
// A listing collector walks the search result pages (following pagination
// links) and passes every posting whose title contains none of the excluded
// words to a second collector, which parses the posting's detail page. At
// most jobLimit postings are collected per call.
//
// It returns an error if the initial listing page cannot be visited; jobs is
// left untouched in that case. Failures on individual detail or pagination
// pages are logged and skipped.
func fetchData() error {
	const (
		baseURL  = "https://thehub.io"
		startURL = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs"
	)

	// Listing collector, restricted to thehub.io.
	c := colly.NewCollector(
		colly.AllowedDomains("www.thehub.io", "thehub.io"),

		// Cache responses on disk to prevent repeated requests.
		// NOTE(review): cached responses are never expired here, so a refresh
		// may be served from disk, and "./tmp" must be writable in the
		// deployed environment (on AWS Lambda typically only /tmp is) —
		// confirm this is intended.
		colly.CacheDir("./tmp"),
	)

	// Separate collector for the job detail pages.
	detailsCollector := c.Clone()

	// Postings whose title contains one of these words are skipped.
	excluded := []string{"senior", "lead"}

	// Results are gathered locally and only published to the package-level
	// jobs slice once the crawl has succeeded, so repeated refreshes replace
	// the cache instead of appending duplicates to it.
	var fetched []job
	limitReached := func() bool { return len(fetched) >= jobLimit }

	// Listing page: one callback per job card.
	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
		if limitReached() {
			return
		}
		title := strings.ToLower(e.ChildText("span.card-job-find-list__position"))
		for _, word := range excluded {
			if strings.Contains(title, word) {
				return
			}
		}
		link := e.ChildAttr("a", "href")
		if link == "" {
			// No detail link on this card; nothing to visit.
			return
		}
		fullLink := baseURL + link
		if err := detailsCollector.Visit(fullLink); err != nil {
			fmt.Println("Skipping", fullLink, "-", err)
		}
	})

	detailsCollector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// Detail page: build one job per posting.
	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) {
		if limitReached() {
			return
		}
		description := e.ChildText("content.text-block__content > span")
		fetched = append(fetched, job{
			Title:       e.ChildText("h2[class=view-job-details__title]"),
			Logo:        logoURLFromStyle(e.ChildAttr("div.media-item__image", "style")),
			Company:     e.ChildText(".bullet-inline-list > a:first-child"),
			Location:    e.ChildText(".bullet-inline-list > a:nth-child(2)"),
			Type:        e.ChildText(".bullet-inline-list > a:nth-child(3)"),
			Description: description,
			Link:        e.Request.URL.String(),
			Skills:      skillChecker(description),
		})
	})

	// Pagination: follow page links until enough jobs have been collected.
	c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
		if limitReached() {
			return
		}
		nextPage := e.Attr("href")
		if nextPage == "" {
			return
		}
		fullNextPage := baseURL + nextPage
		fmt.Println("Visiting next page:", fullNextPage)
		// Result pages link to each other, so "already visited" is expected
		// and not worth reporting.
		if err := e.Request.Visit(fullNextPage); err != nil && err != colly.ErrAlreadyVisited {
			fmt.Println("Skipping page", fullNextPage, "-", err)
		}
	})

	// Start the crawl from the first result page.
	if err := c.Visit(startURL); err != nil {
		return fmt.Errorf("scraping %s: %w", baseURL, err)
	}

	jobs = fetched
	return nil
}

// logoURLFromStyle extracts the URL from an inline CSS declaration such as
// "background-image:url(https://example.com/logo.png);". Surrounding
// whitespace and quotes inside url(...) are removed. It returns "" when the
// style contains no complete url(...) expression.
func logoURLFromStyle(style string) string {
	const open = "url("
	start := strings.Index(style, open)
	if start < 0 {
		return ""
	}
	rest := style[start+len(open):]
	end := strings.Index(rest, ")")
	if end < 0 {
		return ""
	}
	return strings.Trim(strings.TrimSpace(rest[:end]), `"'`)
}

// handler is the function registered with the Lambda runtime. It returns the
// jobs slice as-is while it is non-empty and younger than cacheTTL; otherwise
// it calls fetchData, records the time of the fetch and then returns jobs.
//
// If fetchData fails, handler returns a nil slice together with the error and
// leaves lastFetch unchanged. ctx is accepted but not used.
func handler(ctx context.Context) ([]job, error) {
	cacheFresh := len(jobs) > 0 && time.Since(lastFetch) < cacheTTL
	if !cacheFresh {
		// Cache is empty or stale: scrape again before answering.
		if err := fetchData(); err != nil {
			return nil, err
		}
		lastFetch = time.Now()
	}
	return jobs, nil
}

// main hands handler to the AWS Lambda runtime via lambda.Start.
func main() {
	lambda.Start(handler)
}