sw-jobs-api/scrapers/jobindex.py

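"""Scraper for jobindex.dk IT job listings.

Fetches paginated listings from the jobindex.dk IT category, keeps postings
whose title or description matches one of the include keywords (and whose
title avoids the exclude keywords), stops once a posting older than three
days is reached, and persists the results to a local JSON file, pruning
entries older than 30 days.
"""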
import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime, timedelta
# Base URL of the job listings page for software development jobs
base_url = 'https://www.jobindex.dk/jobsoegning/it/'
# Keywords to include in the job listings
include_keywords = ['nextjs', 'svelte', 'react', 'junior', 'python', 'typescript']
# Keywords to exclude from the job titles
exclude_keywords = ['senior']
# File path for the JSON database
db_file = 'jobs_db.json'
# Load existing jobs from the JSON file if it exists
if os.path.exists(db_file):
    try:
        with open(db_file, 'r') as file:
            existing_jobs = json.load(file)
        print(f"Loaded {len(existing_jobs)} existing jobs from {db_file}")
    except json.JSONDecodeError:
        print("Error reading JSON file. Starting with an empty list.")
        existing_jobs = []
else:
    print("JSON file not found. Starting with an empty list.")
    existing_jobs = []
# Convert existing jobs to a set of IDs for duplicate checking
existing_job_ids = set(job['id'] for job in existing_jobs)
# Function to fetch and parse a page
def fetch_jobs_from_page(url):
    response = requests.get(url)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url} (status {response.status_code})")
    return BeautifulSoup(response.content, 'html.parser')
# Function to extract jobs from the parsed HTML
def extract_jobs(soup):
    job_listings = soup.find_all('div', class_='jobsearch-result')
    jobs = []
    today = datetime.today()
    for job in job_listings:
        heading = job.find('h4')
        title_element = heading.find('a') if heading else None
        company_element = job.find('div', class_='jix-toolbar-top__company')
        location_element = job.find('span', class_='jix_robotjob--area')
        date_posted_element = job.find('time')
        description_element = job.find('p')

        title = title_element.get_text(strip=True) if title_element else ''
        company_link = company_element.find('a', rel='noopener') if company_element else None
        url = company_link['href'] if company_link else ''
        img_element = job.find('img')
        img = f"https://www.jobindex.dk{img_element['src']}" if img_element else ''
        company = company_element.find('a').get_text(strip=True) if company_element else ''
        location = location_element.get_text(strip=True) if location_element else ''
        date_posted = date_posted_element['datetime'] if date_posted_element else ''
        job_link = title_element['href'] if title_element else ''
        description = description_element.get_text(strip=True) if description_element else ''
        print(f"Debug: title={title}, url={url}, img={img}, company={company}, "
              f"location={location}, date_posted={date_posted}, job_link={job_link}, "
              f"description={description}")  # Detailed debug print

        # Convert date_posted to a datetime object; skip jobs with unparseable dates
        try:
            date_posted_dt = datetime.strptime(date_posted, '%Y-%m-%d')
            if (today - date_posted_dt).days > 3:
                print("Job older than 3 days found. Stopping the scraper.")
                return jobs, False  # Returning jobs and False to indicate stopping
        except ValueError:
            continue

        # Keep the job only if the title avoids the exclude keywords and the
        # title/description contains at least one include keyword
        combined_text = f"{title} {description}".lower()
        excluded = any(keyword in title.lower() for keyword in exclude_keywords)
        included = any(keyword in combined_text for keyword in include_keywords)
        if not excluded and included:
            job_id = f"{title}-{url}-{location}-{date_posted}"
            if job_id not in existing_job_ids:
                jobs.append({
                    'id': job_id,
                    'title': title,
                    'url': url,
                    'img': img,
                    'company': company,
                    'location': location,
                    'date_posted': date_posted,
                    'link': job_link
                })
                existing_job_ids.add(job_id)
                print(f"Added job: {job_id}")  # Debug print for each added job
    return jobs, True
# Function to find the next page URL
def get_next_page_url(soup):
    next_page = soup.find('a', {'aria-label': 'Næste'})
    return next_page['href'] if next_page else None
# Main scraping loop
current_url = base_url
all_jobs = []
while current_url:
    print(f"Fetching jobs from: {current_url}")
    soup = fetch_jobs_from_page(current_url)
    jobs, continue_scraping = extract_jobs(soup)
    all_jobs.extend(jobs)
    print(f"Collected {len(jobs)} jobs from this page.")
    if not continue_scraping:
        break
    current_url = get_next_page_url(soup)
# Combine the existing jobs with the new jobs
all_jobs = existing_jobs + all_jobs
# Remove jobs older than 30 days from the combined list
cutoff_date = datetime.today() - timedelta(days=30)
all_jobs = [job for job in all_jobs if datetime.strptime(job['date_posted'], '%Y-%m-%d') >= cutoff_date]
# Final debug print before saving
print(f"Total jobs to be saved: {len(all_jobs)}")
print(f"Jobs to be saved: {all_jobs}") # Debug print to show jobs to be saved
# Save the new jobs to the JSON file
with open(db_file, 'w') as file:
    json.dump(all_jobs, file, indent=4)
print(f"Total jobs saved: {len(all_jobs)}") # Final output after saving to file