-
Notifications
You must be signed in to change notification settings - Fork 0
/
index.js
168 lines (166 loc) · 6.78 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
import puppeteer from 'puppeteer-extra';
import StealthPlugin from 'puppeteer-extra-plugin-stealth';
import fs from 'fs';
import { stringify } from 'csv-stringify';
// import { executablePath } from 'puppeteer';
/**
 * Waits until the "Please verify you are a human" challenge is no longer
 * detected on the given page.
 *
 * The page is probed for the challenge text for up to five seconds. If the
 * text never shows up, Puppeteer rejects the wait with a `TimeoutError`,
 * which is interpreted here as "the page is not blocked". While the text is
 * present, the probe is repeated after a short pause so a person can solve
 * the challenge in the attached browser.
 *
 * @param {object} page - Puppeteer page (anything exposing `waitForXPath`).
 * @returns {Promise<boolean>} Always resolves to `true` once the challenge
 *   text is not found.
 * @throws Re-throws any error from `waitForXPath` other than a timeout.
 */
async function checkAndWaitForAccessDeniedPage(page) {
  const challengeXPath =
    '//*[contains(text(), "Please verify you are a human")]';
  const detectTimeoutMs = 5000;
  const recheckDelayMs = 250;

  while (true) {
    let challengeNode;
    try {
      // The timeout has to be passed inside an options object; a bare number
      // is not read as a timeout.
      challengeNode = await page.waitForXPath(challengeXPath, {
        timeout: detectTimeoutMs,
      });
    } catch (err) {
      // Timing out means the challenge text never appeared: page is usable.
      if (err?.name === 'TimeoutError') {
        return true;
      }
      throw err;
    }

    if (!challengeNode) {
      return true;
    }

    // Challenge still visible: pause briefly instead of re-probing in a
    // tight loop, then check again.
    await new Promise((resolve) => setTimeout(resolve, recheckDelayMs));
  }
}
/** Resolves after `ms` milliseconds; used to pause between page visits. */
const pause = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Ends the CSV stringifier, waits for the piped file stream to finish, and
 * detaches from the browser. Failures while finalising the CSV are logged
 * and reflected in the exit code rather than thrown.
 *
 * @param {object} resources
 * @param {object} [resources.stringifier] - csv-stringify stream, if created.
 * @param {object} [resources.writableStream] - file stream it is piped into.
 * @param {object} [resources.browser] - connected Puppeteer browser, if any.
 * @returns {Promise<void>}
 */
async function releaseResources({ stringifier, writableStream, browser }) {
  try {
    if (stringifier && writableStream) {
      await new Promise((resolve, reject) => {
        writableStream.once('finish', resolve);
        writableStream.once('error', reject);
        // Ending the stringifier ends the piped file stream as well.
        stringifier.end();
      });
    }
  } catch (err) {
    console.error(err);
    process.exitCode = 1;
  }
  // Disconnect instead of close: the script attached to an already running
  // browser via puppeteer.connect and should not shut it down.
  await browser?.disconnect();
}

/**
 * Script entry point.
 *
 * Connects to an already running browser, opens the investor's
 * recent-investments page, collects every listed company (name + link), then
 * visits each company page for its industries and founders, and each founder
 * page for a LinkedIn link. Rows are streamed to a CSV file and the collected
 * data is printed as JSON at the end.
 */
(async () => {
  let browser;
  let stringifier;
  let writableStream;

  try {
    puppeteer.use(StealthPlugin());

    const InvestorUrl =
      'https://www.crunchbase.com/organization/uphonest-capital/recent_investments/investments';
    const InvestorCompanyName = 'Uphonest Capital';
    const outputDir = './data';
    const filename = `${outputDir}/uphonest-capital.csv`;
    const pauseBetweenVisitsMs = 5000;

    // The DevTools endpoint contains a per-launch id, so allow overriding it
    // through the environment; the original value stays as the fallback.
    const browserWSEndpoint =
      process.env.BROWSER_WS_ENDPOINT ??
      'ws://127.0.0.1:9222/devtools/browser/264ea4ec-d1d7-43db-bf45-4354fb0ccd7e';

    browser = await puppeteer.connect({
      browserWSEndpoint,
    });
    const page = await browser.newPage();
    // 0 disables the navigation timeout.
    page.setDefaultNavigationTimeout(0);

    // Make sure the target directory exists before opening the file stream.
    fs.mkdirSync(outputDir, { recursive: true });
    const csvFile = fs.createWriteStream(filename);

    // NOTE: 'CompanyCurnchbaseUrl' is misspelled but is the emitted CSV
    // header, so it is kept unchanged.
    const columns = [
      'InvestorCompany',
      'InvestedCompanyName',
      'CompanyCurnchbaseUrl',
      'Industries',
      'FounderName',
      'FounderLinkedIn',
    ];
    const csv = stringify({ header: true, columns: columns });
    // First data row carries only the investor name.
    csv.write({
      InvestorCompany: InvestorCompanyName,
    });
    csv.pipe(csvFile);
    stringifier = csv;
    writableStream = csvFile;

    await page.goto(InvestorUrl);
    // Wait out the human-verification page if it is shown.
    await checkAndWaitForAccessDeniedPage(page);

    // Table selector for list of recent investments
    const investmentTableSelector =
      'body > chrome > div > mat-sidenav-container > mat-sidenav-content > div > ng-component > entity-v2 > page-layout > div > div > div > div > div > page-centered-layout:nth-child(2) > div > profile-section > section-card > mat-card > div.section-content-wrapper > div > list-card > div > table > tbody';

    // Read name and link of every loaded row in a single in-page pass, so a
    // row without the expected markup yields nulls instead of an exception.
    const investmentRows = await page.$$eval(
      `${investmentTableSelector} > tr`,
      (rows) =>
        rows.map((row) => {
          const anchor = row.querySelector(
            ':scope > td:nth-child(2) > field-formatter > identifier-formatter > a'
          );
          return {
            companyName:
              anchor?.querySelector('div.identifier-label')?.innerText ?? null,
            companyLink: anchor?.href ?? null,
          };
        })
    );

    // set initial data object to set all data into
    const data = {
      InvestorUrl,
      investments: [],
    };

    console.log('Total invested companies list loaded: ', investmentRows.length);

    // Rows without a link cannot be visited; skip them and say so.
    const investedCompanies = investmentRows.filter(({ companyLink }) =>
      Boolean(companyLink)
    );
    if (investedCompanies.length !== investmentRows.length) {
      console.warn(
        `Skipped ${
          investmentRows.length - investedCompanies.length
        } investment row(s) without a company link`
      );
    }

    await pause(pauseBetweenVisitsMs);

    // Selector paths for industries and founders on a company page
    const industrySelector =
      'body > chrome > div > mat-sidenav-container > mat-sidenav-content > div > ng-component > entity-v2 > page-layout > div > div > div > div > page-centered-layout > div > div > div.main-content > row-card:nth-child(1) > profile-section > section-card > mat-card > div.section-content-wrapper > div > fields-card:nth-child(1) > ul > li:nth-child(1) > field-formatter > identifier-multi-formatter > span > chips-container';
    const foundersSelector =
      'body > chrome > div > mat-sidenav-container > mat-sidenav-content > div > ng-component > entity-v2 > page-layout > div > div > div > div > page-centered-layout > div > div > div.main-content > row-card:nth-child(1) > profile-section > section-card > mat-card > div.section-content-wrapper > div > fields-card:nth-child(1) > ul > li:nth-child(4) > field-formatter > identifier-multi-formatter > span';

    // iterate through all companies to get required data
    for (const company of investedCompanies) {
      await page.goto(company.companyLink, {
        waitUntil: 'domcontentloaded',
      });
      await checkAndWaitForAccessDeniedPage(page);

      // Industry chip labels; chips without a label element are dropped.
      const industries = await page.$$eval(`${industrySelector} > a`, (chips) =>
        chips
          .map((chip) => chip.querySelector('div.chip-text')?.innerText)
          .filter((label) => label != null)
      );

      csv.write({
        InvestorCompany: '',
        InvestedCompanyName: company.companyName,
        CompanyCurnchbaseUrl: company.companyLink,
        Industries: industries.join(', '),
      });

      // Founder names and profile links, read before leaving the company page.
      const founders = await page.$$eval(`${foundersSelector} > a`, (links) =>
        links.map((link) => ({
          founderName: link.innerText,
          founderLink: link.href,
        }))
      );

      await pause(pauseBetweenVisitsMs);

      for (const founder of founders) {
        await page.goto(founder.founderLink, {
          waitUntil: 'domcontentloaded',
        });
        await checkAndWaitForAccessDeniedPage(page);

        // The LinkedIn link is optional; a missing anchor yields undefined.
        const linkedInAnchor = await page.$(`a[title="View on LinkedIn"]`);
        const founderLinkedinLink = await page.evaluate(
          (el) => el?.href,
          linkedInAnchor
        );
        founder.founderLinkedinLink = founderLinkedinLink;

        csv.write({
          InvestorCompany: '',
          InvestedCompanyName: '',
          Industries: '',
          FounderName: founder.founderName,
          FounderLinkedIn: founderLinkedinLink,
        });
        await pause(pauseBetweenVisitsMs);
      }

      // push all details to data
      data.investments.push({
        ...company,
        industries,
        founders,
      });
      await pause(pauseBetweenVisitsMs);
    }

    console.log(JSON.stringify(data));
  } catch (e) {
    console.error(e);
    process.exitCode = 1;
  } finally {
    // Finalise the CSV (even a partial one) and detach from the browser.
    await releaseResources({ stringifier, writableStream, browser });
  }
})();