Skip to content

Commit

Permalink
Use a server
Browse files Browse the repository at this point in the history
  • Loading branch information
jvandenaardweg committed Jan 4, 2019
1 parent 1a6d4db commit 34d7ef6
Show file tree
Hide file tree
Showing 8 changed files with 3,232 additions and 337 deletions.
8 changes: 8 additions & 0 deletions .editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
root = true

[*]
end_of_line = lf
insert_final_newline = true
charset = utf-8
indent_style = space
indent_size = 2
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
.env
node_modules
node_modules
/user_data/*
2 changes: 2 additions & 0 deletions Procfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
web: npm start
worker: npm run worker
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,8 @@ LinkedIn profile scraper using Puppeteer headless browser. So you can use it on
1. Create a `.env` file in the root of this project
2. Fill it with `LINKEDIN_LOGIN_EMAIL="your-linkedin-email@email.com"` and `LINKEDIN_LOGIN_PASSWORD="your-linkedin-password"`
3. Run `npm start`
4. Wait for the data to come in


# Usage limits
Read: [LinkedIn Commercial Use Limit](https://www.linkedin.com/help/linkedin/answer/52950)
201 changes: 21 additions & 180 deletions index.js
Original file line number Diff line number Diff line change
@@ -1,182 +1,23 @@
require('dotenv').config()

const puppeteer = require('puppeteer');

const exampleProfileUrl = 'https://www.linkedin.com/in/jvandenaardweg/';

(async () => {

// Launch the browser and open a single tab.
// NOTE: `headless: false` opens a visible browser window, even though the log
// message below speaks of a headless browser.
const browser = await puppeteer.launch({
  headless: false
});

const page = await browser.newPage();

console.log('Browsing to LinkedIn.com in the background using a headless browser...');
await page.setViewport({ width: 1200, height: 720 });
await page.goto('https://www.linkedin.com/', { waitUntil: 'domcontentloaded' }); // wait until page load

console.log('Logging in with the credentials...');
// Credentials come from the environment; dotenv is loaded at the top of this file.
await page.type('#login-email', process.env.LINKEDIN_LOGIN_EMAIL);
await page.type('#login-password', process.env.LINKEDIN_LOGIN_PASSWORD);

// Start waiting for the navigation BEFORE the click is dispatched, and hand both
// un-awaited promises to Promise.all. Awaiting each entry inside the array runs
// them one after the other, so the navigation triggered by the click could
// finish before waitForNavigation() starts listening and the wait would time out.
await Promise.all([
  page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
  page.click('#login-submit')
]);

console.log('Logged in!');

console.log('Navigating to LinkedIn profile...');

await page.goto(exampleProfileUrl, { waitUntil: 'domcontentloaded' });

console.log('LinkedIn profile page loaded!');

console.log('Getting all the LinkedIn profile data by scrolling the page to the bottom, so all the data gets loaded into the page...');

// Scroll to the bottom so content that is only loaded on scroll ends up in the DOM.
await autoScroll(page);

console.log('Parsing data...');

// Expand the collapsed "Experience", "Skills" and "Education" sections so their
// full contents are present in the DOM before parsing.
// page.click() rejects when its selector matches nothing, which would abort the
// whole scrape, so each toggle is looked up first and only clicked when present.
// TODO: check if a toggle needs to be clicked again to reveal further entries
const expandSelectors = [
  '#experience-section .pv-profile-section__see-more-inline.link', // Expand "Experience"
  '[data-control-name="skill_details"]', // Expand "Skills" section
  '#education-section .pv-profile-section__text-truncate-toggle.link' // Expand "Education" section
];

for (const expandSelector of expandSelectors) {
  const toggle = await page.$(expandSelector);

  if (toggle) {
    await toggle.click();
  }
}

// Give the page a moment to render the expanded content.
await page.waitFor(1000);

// Collect the top-card details of the profile.
// The callback runs inside the page, so it can only use what it defines itself.
const userProfile = await page.evaluate(() => {
  const lineBreaks = /(\r\n\t|\n|\r\t)/gm;
  const spaceRuns = / +/g;

  // Trimmed text of an element, or null when the element is missing or empty.
  const trimmedText = (element) => {
    if (element && element.textContent) {
      return element.textContent.trim();
    }

    return null;
  };

  const card = document.querySelector('.pv-profile-section');
  const photoElement = card.querySelector('img.profile-photo-edit__preview');
  const summaryElement = card.querySelector('.pv-top-card-section__summary-text');

  // The summary is flattened: line breaks are dropped and runs of spaces collapsed.
  let description = null;
  if (summaryElement && summaryElement.textContent) {
    description = summaryElement.textContent
      .replace(lineBreaks, '')
      .replace(spaceRuns, ' ')
      .trim();
  }

  return {
    fullName: trimmedText(card.querySelector('.pv-top-card-section__name')),
    title: trimmedText(card.querySelector('.pv-top-card-section__headline')),
    location: trimmedText(card.querySelector('.pv-top-card-section__location')),
    photo: photoElement ? photoElement.getAttribute('src') : null,
    description: description,
    url: window.location.href,
    followersTotal: trimmedText(document.querySelector('.pv-recent-activity-section__follower-count--text')),
    connectionsTotal: trimmedText(document.querySelector('.pv-top-card-v2-section__connections'))
  };
});

console.log('User:');
console.log(userProfile);

console.log('Parsing experiences data...');

// Build one plain object per entry in the "Experience" section.
const experiences = await page.$$eval('#experience-section ul > .ember-view', nodes => {
  // Note: the $$eval context is the browser context.
  // So custom methods you define in this file are not available within this $$eval.

  // Trimmed text of an element, or null when the element is missing or empty.
  const trimmedText = (element) => (element && element.textContent) ? element.textContent.trim() : null;

  return nodes.map((node) => {
    // The date range element is not guaranteed to exist; reading .textContent
    // straight off the querySelector() result would throw for such an entry and
    // fail the whole list, so it is guarded like every other field.
    const dateRangeElement = node.querySelector('.pv-entity__date-range span:nth-child(2)');
    const dateRange = dateRangeElement ? dateRangeElement.textContent : null;

    // The range is split on an en dash into start and end; when there is no
    // dash, only the start is set and the end stays null.
    const [startDate, endDate] = dateRange ? dateRange.split('–') : [];

    return {
      title: trimmedText(node.querySelector('h3')),
      company: trimmedText(node.querySelector('.pv-entity__secondary-title')),
      dateRange: dateRange ? dateRange.trim() : null,
      startDate: startDate ? startDate.trim() : null,
      endDate: endDate ? endDate.trim() : null,
      duration: trimmedText(node.querySelector('.pv-entity__bullet-item-v2')),
      location: trimmedText(node.querySelector('.pv-entity__location span:nth-child(2)'))
    };
  });
});

console.log('Got experiences data:');
console.log(experiences);

// Build one plain object per entry in the "Education" section.
const education = await page.$$eval('#education-section ul > .ember-view', nodes => {
  // Note: the $$eval context is the browser context.
  // So custom methods you define in this file are not available within this $$eval.

  return nodes.map((node) => {
    const schoolName = node.querySelector('h3.pv-entity__school-name');
    const degreeName = node.querySelector('.pv-entity__degree-name .pv-entity__comma-item');
    const fieldOfStudy = node.querySelector('.pv-entity__fos .pv-entity__comma-item');
    const grade = node.querySelector('.pv-entity__grade .pv-entity__comma-item');
    // The first <time> element is taken as the start date, the second as the end date.
    const [startDate, endDate] = node.querySelectorAll('.pv-entity__dates time');

    return {
      schoolName: (schoolName) ? schoolName.textContent.trim() : null,
      degreeName: (degreeName) ? degreeName.textContent.trim() : null,
      fieldOfStudy: (fieldOfStudy) ? fieldOfStudy.textContent.trim() : null,
      // The grade was looked up but never returned; it is now part of the result.
      grade: (grade) ? grade.textContent.trim() : null,
      startDate: (startDate) ? startDate.textContent.trim() : null,
      endDate: (endDate) ? endDate.textContent.trim() : null
    };
  });
});

console.log('Got education data:');
console.log(education);



// Build one plain object per entry in the skills section.
const skills = await page.$$eval('.pv-skill-categories-section ol > .ember-view', (skillNodes) => {
  // This callback executes inside the page, so helpers defined in this file
  // are not reachable from here.
  const parsedSkills = [];

  for (const skillNode of skillNodes) {
    const nameElement = skillNode.querySelector('.pv-skill-category-entity__name-text');
    const countElement = skillNode.querySelector('.pv-skill-category-entity__endorsement-count');

    let skillName = null;
    if (nameElement) {
      skillName = nameElement.textContent.trim();
    }

    // A found count is returned as trimmed text; a missing count falls back to the number 0.
    let endorsementCount = 0;
    if (countElement) {
      endorsementCount = countElement.textContent.trim();
    }

    parsedSkills.push({ skillName: skillName, endorsementCount: endorsementCount });
  }

  return parsedSkills;
});

console.log('Got skills data:');
console.log(skills);



console.log('WE ARE DONE! GOODBYE!');

// All data has been logged; shut down the browser that was launched for this run.
await browser.close();

page.on("error", function (err) {
theTempValue = err.toString();
console.log("Error: " + theTempValue);
const express = require('express')
const app = express()
const port = process.env.PORT || 3000

// TODO: this should be a worker process
// We should send an event to the worker process and wait for an update
// So this server can handle more concurrent connections
const { getLinkedinProfileDetails } = require('./scraper/linkedin')

app.get('/', async (req, res) => {
const urlToScrape = req.query.url

if (urlToScrape && urlToScrape.includes('linkedin.com/')) {
const linkedinProfileDetails = await getLinkedinProfileDetails(urlToScrape)
res.json({ ...linkedinProfileDetails })
} else {
res.json({
message: 'Missing the url parameter. Or given URL is not an LinkedIn URL.'
})
})();

async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 200;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
}
})

if(totalHeight >= scrollHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
app.listen(port, () => console.log(`Example app listening on port ${port}!`))
Loading

0 comments on commit 34d7ef6

Please sign in to comment.