forked from rannes.dev/sw-jobs-scraper
		
	Compare commits
	
		
			2 Commits
		
	
	
		
			c20690f323
			...
			5fbdd9706e
		
	
	| Author | SHA1 | Date | |
|---|---|---|---|
| 5fbdd9706e | |||
| 20511db91f | 
							
								
								
									
										49
									
								
								scraper.go
									
									
									
									
									
								
							
							
						
						
									
										49
									
								
								scraper.go
									
									
									
									
									
								
							| @ -4,6 +4,7 @@ import ( | ||||
| 	"context" | ||||
| 	"fmt" | ||||
| 	"strings" | ||||
| 	"time" | ||||
| 
 | ||||
| 	"github.com/aws/aws-lambda-go/lambda" | ||||
| 	"github.com/gocolly/colly" | ||||
| @ -29,6 +30,13 @@ type skills struct { | ||||
| 	Typescript bool `json:"typescript"` | ||||
| } | ||||
| 
 | ||||
| var ( | ||||
| 	jobs      []job | ||||
| 	lastFetch time.Time | ||||
| 	cacheTTL  = time.Minute * 5 | ||||
| 	jobLimit  = 20 | ||||
| ) | ||||
| 
 | ||||
| func skillChecker(description string) skills { | ||||
| 	return skills{ | ||||
| 		React:      strings.Contains(description, "React"), | ||||
| @ -40,13 +48,9 @@ func skillChecker(description string) skills { | ||||
| 	} | ||||
| } | ||||
| 
 | ||||
| // Slice to store job details | ||||
| var jobs []job | ||||
| 
 | ||||
| func handler(ctx context.Context) ([]job, error) { | ||||
| func fetchData() error { | ||||
| 
 | ||||
| 	baseUrl := "https://thehub.io" | ||||
| 	searchString := "https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&roles=devops&paid=true&countryCode=DK&sorting=newJobs" | ||||
| 	// Instantiate default collector | ||||
| 	c := colly.NewCollector( | ||||
| 		// visit only the hub | ||||
| @ -60,8 +64,16 @@ func handler(ctx context.Context) ([]job, error) { | ||||
| 	excluded := []string{"senior", "lead"} | ||||
| 	// Instantiate a new collector to visit the job details page | ||||
| 	detailsCollector := c.Clone() | ||||
| 
 | ||||
| 	// Count the jobs collected so far so the jobLimit cap can be enforced | ||||
| 	jobCount := 0 | ||||
| 
 | ||||
| 	// On every <div> element with class "card__content", call the callback | ||||
| 	c.OnHTML("div[class=card__content]", func(e *colly.HTMLElement) { | ||||
| 		// Return if the job limit has been reached | ||||
| 		if jobCount >= jobLimit { | ||||
| 			return | ||||
| 		} | ||||
| 		// Get the title and ensure it doesn't contain any excluded words | ||||
| 		title := e.ChildText("span.card-job-find-list__position") | ||||
| 		for _, excludedWord := range excluded { | ||||
| @ -80,6 +92,9 @@ func handler(ctx context.Context) ([]job, error) { | ||||
| 	}) | ||||
| 
 | ||||
| 	detailsCollector.OnHTML("div.view-job-details", func(e *colly.HTMLElement) { | ||||
| 		if jobCount >= jobLimit { | ||||
| 			return | ||||
| 		} | ||||
| 		// Get logo and trim the url | ||||
| 
 | ||||
| 		logo := e.ChildAttr("div.media-item__image", "style") | ||||
| @ -99,6 +114,7 @@ func handler(ctx context.Context) ([]job, error) { | ||||
| 			Skills:      skillChecker(e.ChildText("content.text-block__content > span")), | ||||
| 		} | ||||
| 		jobs = append(jobs, jobDetails) | ||||
| 		jobCount++ | ||||
| 	}) | ||||
| 	// Handle pagination | ||||
| 	c.OnHTML("a.page-link", func(e *colly.HTMLElement) { | ||||
| @ -109,8 +125,29 @@ func handler(ctx context.Context) ([]job, error) { | ||||
| 			e.Request.Visit(fullNextPage) | ||||
| 		} | ||||
| 	}) | ||||
| 	// Visit the initial URL to start scraping | ||||
| 	err := c.Visit("https://thehub.io/jobs?roles=frontenddeveloper&roles=fullstackdeveloper&roles=backenddeveloper&search=developer&paid=true&countryCode=DK&sorting=newJobs") | ||||
| 	if err != nil { | ||||
| 		return err | ||||
| 	} | ||||
| 	return nil | ||||
| } | ||||
| 
 | ||||
| func handler(ctx context.Context) ([]job, error) { | ||||
| 	// Check if cache is valid | ||||
| 	if time.Since(lastFetch) < cacheTTL && len(jobs) > 0 { | ||||
| 		return jobs, nil | ||||
| 	} | ||||
| 
 | ||||
| 	// Fetch new data | ||||
| 	err := fetchData() | ||||
| 	if err != nil { | ||||
| 		return nil, err | ||||
| 	} | ||||
| 
 | ||||
| 	// Update cache timestamp | ||||
| 	lastFetch = time.Now() | ||||
| 
 | ||||
| 	c.Visit(searchString) | ||||
| 	return jobs, nil | ||||
| } | ||||
| 
 | ||||
|  | ||||
		Loading…
	
	
			
			x
			
			
		
	
		Reference in New Issue
	
	Block a user