diff --git a/.gitignore b/.gitignore
index dd569eb..0733327 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,4 @@
/thehub_cache
-/jobs.json
\ No newline at end of file
+/thehub.json
+/itjobbank_cache
+/it-jobbank.json
\ No newline at end of file
diff --git a/main.go b/main.go
index 3449aeb..501816d 100644
--- a/main.go
+++ b/main.go
@@ -12,15 +12,16 @@ import (
)
type job struct {
- Title string `json:"title"`
- Logo string `json:"logo"`
- Company string `json:"company"`
- Location string `json:"location"`
- Type string `json:"type"`
- Description string `json:"description"`
- Link string `json:"link"`
- Skills skills `json:"skills"`
- FirstSeen time.Time `json:"firstSeen"`
+ Title string `json:"title"`
+ Logo string `json:"logo"`
+ Company string `json:"company"`
+ Location string `json:"location"`
+ Type string `json:"type"`
+ Description string `json:"description"`
+ Link string `json:"link"`
+ Skills skills `json:"skills"`
+ FirstSeen time.Time `json:"firstSeen"`
+ Source string `json:"source"`
}
type skills struct {
@@ -33,6 +34,9 @@ type skills struct {
Tailwind bool `json:"tailwind"`
}
+// Utility functions
+
+// skillChecker reports which known skill keywords appear in a job description.
func skillChecker(description string) skills {
return skills{
React: strings.Contains(description, "React"),
@@ -44,7 +48,9 @@ func skillChecker(description string) skills {
Tailwind: strings.Contains(description, "Tailwind"),
}
}
-func jobsToJson(file *os.File) {
+
+// jobsToJson writes the collected jobs to the given file as indented JSON.
+func jobsToJson(file *os.File, jobs []job, fName string) {
// Encode jobs slice to JSON
encoder := json.NewEncoder(file)
encoder.SetIndent("", " ") // Pretty-print with indentation
@@ -55,24 +61,47 @@ func jobsToJson(file *os.File) {
fmt.Println("Job details successfully written to", fName)
}
+func checkIfPaid(description string) bool {
+	for _, keyword := range unpaidKeywords {
+		if strings.Contains(strings.ToLower(description), keyword) {
+			return false
+		}
+	}
+	return true
+}
+func checkIfStudent(description string) string {
+ for _, keyword := range studentKeywords {
+ if strings.Contains(strings.ToLower(description), keyword) {
+ return "student"
+ }
+ }
+ return "full time"
+}
+
// Slice to store job details
var (
- jobs []job
- jobCount int
- maxJobs int = 30
- fName string = "jobs.json"
- excluded = []string{"senior", "lead", "founder", "cto", "vp of"}
+ excluded = []string{"senior", "lead", "founder", "cto", "vp of", "erfaren", "arkitekt", "architect", "manager", "ulønnet", "unpaid", "praktik"}
+ unpaidKeywords = []string{"unpaid", "praktik", "ulønnet"}
+ studentKeywords = []string{"studerende", "studenter", "student", "medhjælper"}
)
func scrapeHub() {
+ // declare and initialize variables
+ var (
+ jobs []job
+ jobCount int
+ fName = "thehub.json"
+ maxJobs = 30
+ baseUrl = "https://thehub.io"
+ searchString = "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"
+ file, err = os.Create(fName)
+ )
- file, err := os.Create(fName)
if err != nil {
log.Fatalf("Cannot create file %q: %s", fName, err)
}
defer file.Close()
- baseUrl := "https://thehub.io"
- searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs"
+
// Instantiate default collector
c := colly.NewCollector(
// visit only the hub
@@ -86,7 +115,9 @@ func scrapeHub() {
detailsCollector := c.Clone()
 	// On every div element with class "card__content",
 	// call the callback to scrape the job listing.
c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) {
- //ensure only scrape the amount of jobs specified
+ if jobCount >= maxJobs {
+ return
+ }
// Get the title and ensure it doesn't contain any excluded words
title := e.ChildText("span.card-job-find-list__position")
@@ -114,14 +145,14 @@ func scrapeHub() {
cutRight := ");"
trimmedLogo := strings.Trim(logo, cutLeft+cutRight)
- // Get job description
- // Get the HTML of the description
+ // Get the HTML of the description and check to see if it's paid
descriptionHTML, err := e.DOM.Find("content.text-block__content > span").Html()
if err != nil {
log.Printf("Error getting HTML of description: %s", err)
return
}
- // Get company name
+
+ // fill in the job struct
jobDetails := job{
Title: e.ChildText("h2[class=view-job-details__title]"),
Logo: trimmedLogo,
@@ -132,13 +163,14 @@ func scrapeHub() {
Link: e.Request.URL.String(),
Skills: skillChecker(e.ChildText("content.text-block__content > span")),
FirstSeen: time.Now(),
+ Source: baseUrl,
}
jobs = append(jobs, jobDetails)
jobCount++
fmt.Println("Scraped job", jobCount)
- if jobCount == maxJobs {
- jobsToJson(file)
- os.Exit(0)
+ if jobCount >= maxJobs {
+ jobsToJson(file, jobs, fName)
+ return
}
})
// Handle pagination
@@ -146,7 +178,6 @@ func scrapeHub() {
nextPage := e.Attr("href")
if nextPage != "" {
fullNextPage := baseUrl + nextPage
- fmt.Println("Visiting next page:", fullNextPage)
e.Request.Visit(fullNextPage)
}
})
@@ -155,8 +186,103 @@ func scrapeHub() {
}
+func scrapeItJobBank() {
+ // declare and initialize variables
+ var (
+ jobs []job
+ jobCount int
+ fName = "it-jobbank.json"
+ maxJobs = 30
+ baseUrl = "https://www.it-jobbank.dk"
+ searchString = "https://www.it-jobbank.dk/jobsoegning"
+ file, err = os.Create(fName)
+ )
+ if err != nil {
+ log.Fatalf("Cannot create file %q: %s", fName, err)
+ }
+ defer file.Close()
+ // Instantiate default collector
+ c := colly.NewCollector(
+ // visit only the hub
+ colly.AllowedDomains("www.it-jobbank.dk", "it-jobbank.dk"),
+
+ // Cache responses to prevent multiple requests
+ colly.CacheDir("./itjobbank_cache"),
+ )
+
+ // Instantiate a new collector to visit the job details page
+ detailsCollector := c.Clone()
+	// On every div element with class "result", pass the
+	// job link on to the details collector.
+ c.OnHTML("div[class=result]", func(e *colly.HTMLElement) {
+ if jobCount >= maxJobs {
+ return
+ }
+ // Get the title and ensure it doesn't contain any excluded words
+ title := e.ChildText("h3.job-title > a")
+ for _, excludedWord := range excluded {
+ if strings.Contains(strings.ToLower(title), excludedWord) {
+ return
+ }
+ }
+ fullLink := e.ChildAttr("h3.job-title > a", "href")
+
+ detailsCollector.Visit(fullLink)
+ })
+
+ detailsCollector.OnRequest(func(r *colly.Request) {
+ fmt.Println("Visiting", r.URL.String())
+ })
+
+ detailsCollector.OnHTML("section > div", func(e *colly.HTMLElement) {
+ // get the description as html
+ descriptionHTML, err := e.DOM.Find("div[id=job_ad]").Html()
+ if err != nil {
+ log.Printf("Error getting HTML of description: %s", err)
+ return
+ }
+		// NOTE(review): result of checkIfPaid is discarded, so unpaid jobs are never actually skipped here
+		checkIfPaid(descriptionHTML)
+ // fill in the job struct
+ title := e.ChildText("h1.title")
+ if title == "" {
+ title = e.ChildText("h1[id=jobtitle]")
+ }
+ jobDetails := job{
+ Title: title,
+			Logo:        baseUrl + e.ChildAttr("div.companmony-logo > img", "src"), // NOTE(review): "companmony" looks like a typo — verify against the site's markup
+			Company:     e.ChildText("p.published"), // NOTE(review): "p.published" as the company name looks wrong — confirm selector
+ Location: e.ChildText("div.job-location > p.caption"),
+ Type: checkIfStudent(descriptionHTML),
+ Description: descriptionHTML,
+ Link: e.Request.URL.String(),
+ Skills: skillChecker(descriptionHTML),
+ FirstSeen: time.Now(),
+ Source: baseUrl,
+ }
+ jobs = append(jobs, jobDetails)
+ jobCount++
+ fmt.Println("Scraped job", jobCount)
+ if jobCount >= maxJobs {
+ jobsToJson(file, jobs, fName)
+ return
+ }
+ })
+ // Handle pagination
+ c.OnHTML("a.page-link", func(e *colly.HTMLElement) {
+ if jobCount >= maxJobs {
+ return
+ }
+ nextPage := e.Attr("href")
+ if nextPage != "" {
+ e.Request.Visit(nextPage)
+ }
+ })
+	c.Visit(searchString)
+	if jobCount < maxJobs { jobsToJson(file, jobs, fName) } // flush results when fewer than maxJobs were scraped
+}
func main() {
scrapeHub()
+ scrapeItJobBank()
}