Skip to content

Commit

Permalink
Merge pull request #22 from linkedtales/20-linkedin-updates
Browse files: browse the repository at this point in the history
20 linkedin updates
  • Loading branch information
leonardiwagner authored May 15, 2019
2 parents 88ee423 + 4502b43 commit b448618
Show file tree
Hide file tree
Showing 6 changed files with 48 additions and 22 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "scrapedin",
"version": "1.0.4",
"version": "1.0.5",
"description": "linkedin scraper for 2019 website",
"keywords": [
"linkedin",
Expand Down
22 changes: 19 additions & 3 deletions src/cleanProfileData.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,25 @@
const logger = require('./logger')

module.exports = (profile) => {
if(!profile.profile) {
const messageError = 'LinkedIn website changed and scrapedin can\'t read basic data. Please report this issue at https://github.com/linkedtales/scrapedin/issues'
logger.error('cleanMessageData', messageError, '')
throw new Error(messageError)
}

if(profile.profile.connections) {
profile.profile.connections = profile.profile.connections.replace(' connections', '')

if(profile.profile.connections.indexOf('followers') > -1){
profile.profile.followers = profile.profile.connections
.replace(' followers', '')
.replace(',', '')
}
}

// Backward compatibility only: the summary used to be scraped as profile.profile.summary, so mirror about.text there.
if(profile.about && profile.about.text) {
profile.profile.summary = profile.about.text
}

profile.positions.forEach((position) => {
Expand Down Expand Up @@ -43,8 +62,5 @@ module.exports = (profile) => {
})
}




return profile
}
6 changes: 5 additions & 1 deletion src/profile.js
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ module.exports = async (browser, url, waitTimeToScrapMs = 500) => {
logger.info('profile', `starting scraping url: ${url}`)

const page = await openPage(browser, url)
await page.waitFor("h1[class~='pv-top-card-section__name']", { timeout: 5000 })
const profilePageIndicatorSelector = "ul.pv-top-card-v3--list"
await page.waitFor(profilePageIndicatorSelector, { timeout: 5000 })
.catch(() => {
logger.warn('profile', 'profile selector was not found')
throw new Error('linkedin: profile not found')
Expand All @@ -33,7 +34,9 @@ module.exports = async (browser, url, waitTimeToScrapMs = 500) => {
await new Promise((resolve) => { setTimeout(() => { resolve() }, waitTimeToScrapMs / 2)})
}


const [profile] = await scrapSection(page, template.profile)
const [about] = await scrapSection(page, template.about)
const positions = await scrapSection(page, template.positions)
const educations = await scrapSection(page, template.educations)
const [recommendationsCount] = await scrapSection(page, template.recommendationsCount)
Expand All @@ -49,6 +52,7 @@ module.exports = async (browser, url, waitTimeToScrapMs = 500) => {

const rawProfile = {
profile,
about,
positions,
educations,
skills,
Expand Down
19 changes: 13 additions & 6 deletions src/profileScraperTemplate.js
Original file line number Diff line number Diff line change
@@ -1,12 +1,19 @@
const profileSelector = '.core-rail > * > section:first-child >'

module.exports = {
profile: {
selector: 'section.pv-profile-section.pv-top-card-section',
selector: '.pv-content',
fields: {
name: 'h1[class~=pv-top-card-section__name]',
headline: 'h2[class~=pv-top-card-section__headline]',
location: 'h3[class~=pv-top-card-section__location]',
summary: 'p[class~=pv-top-card-section__summary-text]',
connections: '.pv-top-card-v2-section__connections',
name: `${profileSelector} div:last-child > div:last-child > div:first-child ul:first-child > li:first-child`,
headline: `${profileSelector} div:last-child h2`,
location: `${profileSelector} div:last-child > div:last-child > div:first-child ul:last-child > li:first-child`,
connections: `${profileSelector} div:last-child > div:last-child > div:first-child ul:last-child > li:nth-child(2)`
}
},
about: {
selector: '.pv-about-section',
fields: {
text: '.pv-about__summary-text'
}
},
positions: {
Expand Down
6 changes: 3 additions & 3 deletions src/scrapSection.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@ const scrapSelectorFields = (selector, section) => async (scrapedObjectPromise,
if (!isFieldPresent) { return scrapedObject }

if (field.isMultipleFields) {
scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText))
scrapedObject[fieldKey] = await selector.$$eval(fieldSelectorString, (elems) => elems.map(elem => elem.innerText.trim()))
} else if (field.hasChildrenFields) {
const fieldChildrenSelectors = await selector.$$(field.selector)

scrapedObject[fieldKey] = await Promise.all(
fieldChildrenSelectors.map((s) => scrapSelector(s, field))
)
} else if (field.attribute && field.attribute === 'href') {
scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem ? elem.href : '')
scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem ? elem.href.trim() : '')
} else {
scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem ? elem.innerText : '')
scrapedObject[fieldKey] = await selector.$eval(fieldSelectorString, (elem) => elem ? elem.innerText.trim() : '')
}

return scrapedObject
Expand Down
15 changes: 7 additions & 8 deletions src/scrapedin.js
Original file line number Diff line number Diff line change
Expand Up @@ -3,29 +3,28 @@ const login = require('./login')
const profile = require('./profile')
const logger = require('./logger')

module.exports = ({ email, password, isHeadless, hasToLog, proxyAddress } = { isHeadless: true, hasToLog: false }) => new Promise(async (resolve, reject) => {
module.exports = async({ email, password, isHeadless, hasToLog, puppeteerArgs } = { isHeadless: true, hasToLog: false }) => {
if (!hasToLog) {
logger.stopLogging()
}
logger.info('scrapedin', 'initializing')

if (!email || !password) {
logger.warn('scrapedin', 'required parameters email and password was not provided')
return reject(new Error('scrapedin: email and password are required to access linkedin profiles'))
throw new Error('scrapedin: email and password are required to access linkedin profiles')
}

logger.info('scrapedin', 'required parameters email and password was provided')

const args = proxyAddress && [`--proxy-server=${proxyAddress}`]
args && logger.info('scrapedin', `using proxy address: ${proxyAddress}`)
const browser = await puppeteer.launch({ headless: isHeadless, args })
const args = Object.assign({ headless: isHeadless }, puppeteerArgs)
const browser = await puppeteer.launch(args)

try {
await login(browser, email, password, logger)
} catch (e) {
await browser.close()
return reject(e)
throw e
}

return resolve(Promise.resolve((url, waitMs) => profile(browser, url, waitMs)))
})
return (url, waitMs) => profile(browser, url, waitMs)
}

0 comments on commit b448618

Please sign in to comment.