import requests
from bs4 import BeautifulSoup
import json
import os
from datetime import datetime, timedelta
from urllib.parse import urljoin

# Base URL of the job listings page for software development jobs
base_url = 'https://www.jobindex.dk/jobsoegning/it/'

# Keywords to include in the job listings
include_keywords = ['nextjs', 'svelte', 'react', 'junior', 'python', 'typescript']

# Keywords to exclude from the job titles
exclude_keywords = ['senior']

# File path for the JSON database
db_file = 'jobs_db.json'

# Load existing jobs from the JSON file if it exists
if os.path.exists(db_file):
    try:
        with open(db_file, 'r', encoding='utf-8') as file:
            existing_jobs = json.load(file)
        print(f"Loaded {len(existing_jobs)} existing jobs from {db_file}")
    except json.JSONDecodeError:
        print("Error reading JSON file. Starting with an empty list.")
        existing_jobs = []
else:
    print("JSON file not found. Starting with an empty list.")
    existing_jobs = []

# Convert existing jobs to a set of IDs for duplicate checking
existing_job_ids = set(job['id'] for job in existing_jobs)


# Function to fetch and parse a page
def fetch_jobs_from_page(url):
    response = requests.get(url, timeout=30)
    if response.status_code != 200:
        raise Exception(f"Failed to load page {url}")
    return BeautifulSoup(response.content, 'html.parser')


# Function to extract jobs from the parsed HTML
def extract_jobs(soup):
    job_listings = soup.find_all('div', class_='jobsearch-result')
    jobs = []
    today = datetime.today()

    for job in job_listings:
        # Guard against listings without an <h4>, which would otherwise raise AttributeError
        heading = job.find('h4')
        title_element = heading.find('a') if heading else None
        company_element = job.find('div', class_='jix-toolbar-top__company')
        location_element = job.find('span', class_='jix_robotjob--area')
        date_posted_element = job.find('time')
        job_link_element = title_element  # The title anchor also carries the job link
        description_element = job.find('p')

        title = title_element.get_text(strip=True) if title_element else ''
        url = company_element.find('a', rel='noopener')['href'] if company_element and company_element.find('a', rel='noopener') else ''
        img = f"https://www.jobindex.dk{job.find('img')['src']}" if job.find('img') else ''
        company = company_element.find('a').get_text(strip=True) if company_element else ''
        location = location_element.get_text(strip=True) if location_element else ''
        date_posted = date_posted_element['datetime'] if date_posted_element else ''
        job_link = job_link_element['href'] if job_link_element else ''
        description = description_element.get_text(strip=True) if description_element else ''

        # Detailed debug print
        print(f"Debug: title={title}, url={url}, img={img}, company={company}, "
              f"location={location}, date_posted={date_posted}, job_link={job_link}, "
              f"description={description}")

        # Convert date_posted to a datetime object; skip listings with unparseable dates
        try:
            date_posted_dt = datetime.strptime(date_posted, '%Y-%m-%d')
            if (today - date_posted_dt).days > 3:
                print("Job older than 3 days found. Stopping the scraper.")
                return jobs, False  # Returning jobs and False to indicate stopping
        except ValueError:
            continue

        combined_text = f"{title} {description}".lower()
        if (not any(exclude_keyword in title.lower() for exclude_keyword in exclude_keywords)
                and any(include_keyword in combined_text for include_keyword in include_keywords)):
            job_id = f"{title}-{url}-{location}-{date_posted}"
            if job_id not in existing_job_ids:
                jobs.append({
                    'id': job_id,
                    'title': title,
                    'url': url,
                    'img': img,
                    'company': company,
                    'location': location,
                    'date_posted': date_posted,
                    'link': job_link
                })
                existing_job_ids.add(job_id)
                print(f"Added job: {job_id}")  # Debug print for each added job

    return jobs, True


# Function to find the next page URL ('Næste' is Danish for 'Next')
def get_next_page_url(soup):
    next_page = soup.find('a', {'aria-label': 'Næste'})
    # Resolve relative pagination links against the base URL
    return urljoin(base_url, next_page['href']) if next_page else None


# Main scraping loop
current_url = base_url
all_jobs = []
while current_url:
    print(f"Fetching jobs from: {current_url}")
    soup = fetch_jobs_from_page(current_url)
    jobs, continue_scraping = extract_jobs(soup)
    all_jobs.extend(jobs)
    print(f"Collected {len(jobs)} jobs from this page.")
    if not continue_scraping:
        break
    current_url = get_next_page_url(soup)

# Combine the existing jobs with the new jobs
all_jobs = existing_jobs + all_jobs

# Remove jobs older than 30 days from the combined list
cutoff_date = datetime.today() - timedelta(days=30)
all_jobs = [job for job in all_jobs if datetime.strptime(job['date_posted'], '%Y-%m-%d') >= cutoff_date]

# Final debug print before saving
print(f"Total jobs to be saved: {len(all_jobs)}")
print(f"Jobs to be saved: {all_jobs}")  # Debug print to show jobs to be saved

# Save the combined job list to the JSON file
with open(db_file, 'w', encoding='utf-8') as file:
    json.dump(all_jobs, file, indent=4)

print(f"Total jobs saved: {len(all_jobs)}")  # Final output after saving to file