-
Notifications
You must be signed in to change notification settings - Fork 5
/
evaluating_data_inclusion.html
413 lines (346 loc) · 23.3 KB
/
evaluating_data_inclusion.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="theme-color" content= "#24292e">
<meta name="ha-url" content="https://collector.githubapp.com/github-social-impact/collect">
<!-- NOTE(review): this canonical URL points at the "intro-to-open-source" page on socialimpact.github.com, while this page's logo links to datakind.github.io; the same canonical link is also repeated further down in this <head>. Confirm the intended canonical URL for this page. -->
<link rel="canonical" href="https://socialimpact.github.com/developers/intro-to-open-source/">
<link rel="alternate" type="application/rss+xml" title="GitHub Social Impact" href="/feed.xml">
<link rel="stylesheet" href="https://use.typekit.net/fhk4uct.css">
<link rel="stylesheet" media="screen" href="assets/css/index.css" />
<link rel="shortcut icon" type="image/ico" href="assets/img/favicons/favicon.png" />
<link rel="apple-touch-icon" href="assets/img/favicons/apple-touch-icon.png" />
<link rel="icon" href="assets/img/favicons/android-favicon.png">
<title> DataKind Playbook </title>
<meta name="generator" content="Jekyll v3.9.5" />
<meta property="og:title" content="Introduction to Open Source for Nonprofits and Social Sector Organizations" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="The Social Impact team uses GitHub’s product, brand and employees to empower nonprofits and the greater social sector to make a positive & lasting contribution to the world." />
<meta property="og:description" content="The Social Impact team uses GitHub’s product, brand and employees to empower nonprofits and the greater social sector to make a positive & lasting contribution to the world." />
<link rel="canonical" href="https://socialimpact.github.com/developers/intro-to-open-source/" />
<meta property="og:url" content="https://socialimpact.github.com/developers/intro-to-open-source/" />
<meta property="og:site_name" content="GitHub Social Impact" />
<meta property="og:image" content="datakind/playbook/assets/img/DK_playbook_org.svg" />
<meta property="og:type" content="website" />
<meta name="twitter:card" content="summary_large_image" />
<meta property="twitter:image" content="https://socialimpact.github.com/assets/img/social-card.png" />
<meta property="twitter:title" content="Introduction to Open Source for Nonprofits and Social Sector Organizations" />
<meta name="twitter:site" content="@github" />
<script type="application/ld+json">
{"@context":"https://schema.org","@type":"WebPage","description":"The Social Impact team uses GitHub’s product, brand and employees to empower nonprofits and the greater social sector to make a positive & lasting contribution to the world.","headline":"Introduction to Open Source for Nonprofits and Social Sector Organizations","url":"https://socialimpact.github.com/developers/intro-to-open-source/"}</script>
<style>
  /* Every element tagged with a data-animate-in direction starts fully
     transparent. NOTE(review): presumably the scripts loaded at the end of
     <body> reveal them; confirm. */
  [data-animate-in="up"],
  [data-animate-in="fade"],
  [data-animate-in="left"],
  [data-animate-in="right"] {
    opacity: 0;
  }
</style>
<!-- Layout helpers: a centered, fixed-width text column and an image that fills its container. -->
<style>
  .content-container {
    max-width: 600px; /* Adjust this to set the desired width of the content */
    margin: 20px auto; /* Centers the container on the page */
    text-align: left; /* Aligns the text to the left */
  }
  .image-container img {
    width: 100%; /* Makes the image stretch to the width of the container */
    height: auto; /* Maintains the aspect ratio */
    display: block; /* Ensures the image does not sit inline with text */
    margin-top: 20px; /* Adds space above the image */
  }
</style>
<!-- Sub-bullet points: lists with class "custom-bullets" use a hollow circle as their marker. -->
<style>
  /* General styling for the list */
  ul.custom-bullets {
    padding-left: 20px; /* Adjust padding to align the list inside its container */
  }
  /* Style for individual list items */
  ul.custom-bullets li {
    list-style-type: none; /* Removes default list styling */
    position: relative; /* Positions the pseudo-element relative to the list items */
    margin-bottom: 10px; /* Adds space between list items */
  }
  /* Pseudo-element for custom bullet */
  ul.custom-bullets li::before {
    content: "○"; /* Hollow circle symbol */
    position: absolute; /* Positions the pseudo-element absolutely within the relative parent */
    left: -20px; /* Moves the bullet to the left of the list item text */
    top: 0; /* Aligns the bullet with the top of the text */
  }
</style>
<!-- jQuery is included once here, ahead of the scripts loaded at the end of <body>. -->
<script src="assets/js/jquery.min.js"></script>
</head>
<body class="page-loading no-js developers-intro" id="page-developers-intro">
<a class="skip-to-content" href="#content">skip to content</a>
<header class="site-header text-black" id="site-header">
<div class="container-wide">
<div class="site-branding">
<a href="https://datakind.github.io">
<img src="assets/img/DK_playbook_org.svg" alt="DataKind logo" />
<span class="sr-only">Return to homepage</span>
</a>
</div>
<ul class="desktop-navigation" style="color:black">
<li class="has-sub-nav">
<a href="projectstages.html">
<span>Project Stages</span>
<svg height="16" class="octicon octicon-chevron-down" viewBox="0 0 16 16" version="1.1" width="16" aria-hidden="true"><path fill-rule="evenodd" d="M12.78 6.22a.75.75 0 010 1.06l-4.25 4.25a.75.75 0 01-1.06 0L3.22 7.28a.75.75 0 011.06-1.06L8 9.94l3.72-3.72a.75.75 0 011.06 0z"></path></svg>
</a>
<ul class="sub-nav">
<li>
<a href="discovery.html">Discovery</a>
</li>
<li>
<a href="design.html">Design</a>
</li>
<li>
<a href="prepare.html">Prepare</a>
</li>
<li>
<a href="execute.html">Execute</a>
</li>
<li>
<a href="share.html">Share</a>
</li>
<li>
<a href="evaluate.html">Evaluate</a>
</li>
</ul>
</li>
<li class="has-sub-nav">
<a href="SIOs.html">
<span>Social Impact Organizations</span>
<svg height="16" class="octicon octicon-chevron-down" viewBox="0 0 16 16" version="1.1" width="16" aria-hidden="true"><path fill-rule="evenodd" d="M12.78 6.22a.75.75 0 010 1.06l-4.25 4.25a.75.75 0 01-1.06 0L3.22 7.28a.75.75 0 011.06-1.06L8 9.94l3.72-3.72a.75.75 0 011.06 0z"></path></svg>
</a>
<ul class="sub-nav">
<li>
<a href="scoping.html">Data Project Scoping</a>
</li>
<li>
<a href="data_engineering.html">Data Engineering</a>
</li>
<li>
<a href="genai.html">Generative AI</a>
</li>
<li>
<a href="data_governance.html">Data Governance</a>
</li>
</ul>
</li>
<li class="has-sub-nav">
<a href="Volunteers.html">
<span>Volunteers</span>
<svg height="16" class="octicon octicon-chevron-down" viewBox="0 0 16 16" version="1.1" width="16" aria-hidden="true"><path fill-rule="evenodd" d="M12.78 6.22a.75.75 0 010 1.06l-4.25 4.25a.75.75 0 01-1.06 0L3.22 7.28a.75.75 0 011.06-1.06L8 9.94l3.72-3.72a.75.75 0 011.06 0z"></path></svg>
</a>
<ul class="sub-nav">
<li>
<a href="Volunteers.html">Getting Started</a>
</li>
<li>
<a href="donate.html">Donate</a>
</li>
<li>
<a href="chapter_operations.html">Chapter Operations</a>
</li>
<li>
<a href="volunteer_management.html">Volunteer Management</a>
</li>
</ul>
</li>
</ul>
<div class="burger js-menu-trigger">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 4H21V6H3V4ZM3 11H21V13H3V11ZM3 18H21V20H3V18Z" fill="rgba(255,255,255,1)"></path></svg>
</div>
</div>
</header>
<!---
</head>
<body class="page-loading no-js insights" id="page-developers-intro">
<a class="skip-to-content" href="#content">skip to content</a>
<header class="site-header" id="site-header">
<div class="container-wide">
<div class="site-branding">
<a href="/">
<img src="assets/img/DK_playbook_org.svg" alt="DataKind Playbook logo" />
<span class="sr-only">Return to homepage</span>
</a>
</div>
<ul class="desktop-navigation">
<li class="text-black">
<a href="discovery.html">Back to Discovery Articles</a>
</li>
</ul>
<nav class="navigation-drawer">
<div>
<ul class="mobile-navigation">
<li class="text-black">
<a href="discovery.html">Back to Discovery Articles</a>
</li>
</ul>
</div>
</nav>
<div class="burger js-menu-trigger">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24"><path d="M3 4H21V6H3V4ZM3 11H21V13H3V11ZM3 18H21V20H3V18Z" fill="rgba(255,255,255,1)"></path></svg>
</div>
</div>
</header>
-->
<main id="content" class="main page-content" aria-label="Content">
<article>
<div class="wrapper mt-6 pt-6 pt-md-4 pb-4">
<div class="skill-level">
<p>
<b>Intended audience:</b>
<span>DataKind Volunteers</span>
</p>
</div>
</div>
<div class="wrapper-thin-alt py-6 mb-6 rte long-form">
<h4>Evaluating Data Inclusion</h4>
<p>Data inclusion ensures the data are representative and are not missing any people, variables, communities, or indicators. Evaluating possible data inclusion risks means understanding the assumptions that are made when collecting, curating, or tagging data by evaluating the abstract notions of data, what’s included, and decisions that were made in collection. </p>
<p>At DataKind, we never build algorithms where the user provides an input and receives an output without being able to see or otherwise understand the inner workings of the algorithm that produced the output. We avoid situations where variables like race, gender, ability, and sexual orientation, among others—including their proxies—cannot be tested in a transparent way. Instead, we opt for interpretable algorithms when it comes to people. Proxies can be identified if domain experts can “guess” at protected characteristics or PII from the information. If these data are missing from your dataset, it may be difficult to evaluate your model for underlying and unintended bias. How will you know whether you have an algorithm that excludes women if you don’t test your model for gender inclusivity? Make sure you have the variables you need in your dataset to test for inclusivity, but ensure that your plan does not include using these variables in a model that cannot be interpreted.</p>
<p>An example of data exclusion is health outcomes data in <a href="https://dhis2.org/">DHIS2</a>, a health software platform used for data tracking. Consider an instance in which only data from health facilities in Kenya were reported. These data may represent certain groups within the Kenyan population, but only those who have access to healthcare. That is, these data include only formal healthcare transactions and not informal cases in rural populations. Another example is data collected by randomly selecting household addresses or aggregated at the zip code level: although meant to be representative of the whole population, such data often exclude the homeless. Similarly, any data collected digitally could exclude those who are not engaged on these platforms.</p>
<p>Below are some common questions to consider when approaching data inclusion efforts with your project. </p>
<h5>Data Provenance</h5>
<p>Understand and interrogate how the dataset was created or composed.</p>
<ul>
<li>For what purpose was the dataset created? Was it for a specific task or initiative?</li>
<li>Who collected the data (e.g., a specific team or research group, field workers, etc.)?</li>
<li>How were the data collected? Were data acquired automatically, such as through web scraping, character recognition, behavioral data, etc.?</li>
<li>What incentives do the data collection agents have in reporting? Might somebody (consciously or subconsciously) be manipulating data to meet objectives?</li>
<li>If using pre-trained data from outside the partner organization, where do the data come from? What assumptions are made when using pre-trained data to answer project-specific questions? What potential biases may arise from using pre-trained data?</li>
</ul>
<h5>Data Inclusion and Disparities</h5>
<p>Identify and evaluate who is missing from the data </p>
<ul>
<li>Is there anything about the dataset and the way the data were collected, preprocessed, cleaned, and labeled that might impact the dataset’s future uses? How might these composition mechanisms negatively impact the way you intend to use data?</li>
<li>What variables are missing that might help you identify who is missing from your data set and check if it is truly representative and inclusive (e.g., race, gender, sexual orientation, etc.)? If possible, compare aggregated population statistics to the sample data in order to understand skews across demographics. Do you notice any differences? (e.g., population is 50% female, but data is 75% male) Note the global nature of DataKind projects, such that many protected categories like race differ by country.</li>
<li>What variables may be problematic to include in a model (e.g. race, gender, etc.)?</li>
<li>Did you detect anomalies during the Data Audit that may inform your execution plan?</li>
<li>Are there biases in the sample selection or variables made available? Interrogate available data and search for potential biases and limited representativeness of the problem. Keep <a href="http://jfoulds.informationsystems.umbc.edu/papers/2019/Foulds%20(2019)%20-%20DifferentialFairness_NeurIPS_MLWG.pdf">differential fairness</a> and intersectionality in mind as you evaluate the data.</li>
<li>Consider additional technical approaches to mitigate challenges with missing data (e.g., resampling). What approaches can you take to mitigate or estimate missing data? For example:
<ul>
<li>Estimate the missing data using techniques like interpolation (only use this option if you can do so with strong accuracy!)</li>
<li>Sample down to create a dataset whose class balance matches the population served</li>
<li>Create augmented data that adds in those excluded observations (carefully, with subject matter expert support)</li>
<li>Missing data (nulls): impute the mean, regress a value (if correlated with other values), leave null if informational, or throw out if missing at random</li>
</ul>
</li>
</ul>
<h5>Subject Matter Expertise and Project Specific Issues</h5>
<p>Incorporate domain expertise and previous research to ensure data representativeness and inclusivity.</p>
<ul>
<li>What domain expertise does the partner organization have to ensure data representativeness and inclusivity? What historical context can the partner organization and other domain experts provide to ensure that no group is underrepresented or systematically excluded in the data?</li>
<li>What types of harms can be anticipated from using your methodology, taking into account previous research done in similar contexts? For example:
<ul>
<li>Does the model allocate a particular resource?</li>
<li>Could the model propagate harmful stereotypes or beliefs?</li>
</ul>
</li>
<li>Does your outcome of interest pertain to a theoretical construct (e.g. well-being, trust, happiness) that requires subject matter expertise to assess or interpret?</li>
<li>How is the outcome of interest being measured (e.g., self-reported, inferred from observation)? Ensure construct validity by working with your partner organization and other domain experts to question model assumptions.</li>
<li>Is there anything that the partner organization might need to know to avoid, which could result in unfair treatment of individuals or groups?</li>
<li>Could the model have unintended consequences for certain groups?</li>
<li>If the model is discovered to cause harm to a group of people, is there a mechanism to take the model offline and provide recourse to those affected?</li>
<li>Could the model be used for nefarious purposes? For example, could a tool made to predict student dropout rates be used to exclude—rather than support—particular at-risk students?</li>
<li>Would it be possible to misinterpret the data and results?</li>
<li>Would the end users be able to understand the proposed output?</li>
</ul>
<h5>Additional resources:</h5>
<ul>
<li>Gebru, T., Morgenstern, J., Vecchione, B., Vaughan, J., Wallach, H., Daumé III, H., and Crawford, K. (2018). <a href="https://arxiv.org/pdf/1803.09010.pdf"> Datasheets for datasets</a>. arXiv preprint arXiv:1803.09010.</li>
<li>Suresh, H., and Guttag, J. (2019). <a href="https://arxiv.org/pdf/1901.10002.pdf">A framework for understanding unintended consequences of machine learning</a>. arXiv preprint arXiv:1901.10002.</li>
<li>Jacobs, A. (2021). <a href="https://arxiv.org/pdf/1912.05511.pdf">Measurement and fairness</a>. In Proceedings of the 2021 ACM Conference on Fairness, Accountability, and Transparency (pp. 375-385).</li>
</ul>
<p> <b>Contributor(s): </b>Caitlin Augustin, Benjamin Kinsella, Emily Yelverton, Jeremy Osborn, Manojit Nandi, Daniel Nissani, Phil Azar, William Ratcliff, Mallory Sheff</p>
</div>
</article>
<section class="wrapper-full bg-gray-light border-top">
<div class="wrapper-thin" style="display: flex; justify-content: center; align-items: center; padding: 30px;">
<div style="flex: 1; text-align: center; padding-right: 10px; max-width: 900px;">
<h5 class="h5 mb-4">Contact us</h5>
<p class="p-lg text-black" style="font-size: 16px;">If you would like to learn more about us, partner with us, or get in touch, email us at [email protected]</p>
</div>
<div style="flex: 1; text-align: center; padding-left: 10px; max-width: 900px;"> <!-- Reduced padding and increased max-width -->
<b class="f3">Subscribe to our newsletter</b>
<br>
<a class="button button--outline mt-2" href="https://www.datakind.org/subscribe/">
<span>Subscribe</span>
</a>
</div>
</div>
</section>
</main>
<footer class="footer bg-gray-light pt-1">
<div class="bg-gray-light">
<div class="container-xl p-responsive f6 py-4 d-sm-flex flex-justify-between flex-row-reverse flex-items-center">
<ul class="list-style-none d-flex flex-items-center mb-3 mb-sm-0 lh-condensed-ultra social-icons">
<li class="mr-4">
<a href="https://twitter.com/DataKind" data-analytics-click="Footer, go to Twitter, text:twitter" title="DataKind on Twitter" aria-label="DataKind on Twitter">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 273.5 222.3" class="d-block" height="18">
<path d="M273.5 26.3a109.77 109.77 0 0 1-32.2 8.8 56.07 56.07 0 0 0 24.7-31 113.39 113.39 0 0 1-35.7 13.6 56.1 56.1 0 0 0-97 38.4 54 54 0 0 0 1.5 12.8A159.68 159.68 0 0 1 19.1 10.3a56.12 56.12 0 0 0 17.4 74.9 56.06 56.06 0 0 1-25.4-7v.7a56.11 56.11 0 0 0 45 55 55.65 55.65 0 0 1-14.8 2 62.39 62.39 0 0 1-10.6-1 56.24 56.24 0 0 0 52.4 39 112.87 112.87 0 0 1-69.7 24 119 119 0 0 1-13.4-.8 158.83 158.83 0 0 0 86 25.2c103.2 0 159.6-85.5 159.6-159.6 0-2.4-.1-4.9-.2-7.3a114.25 114.25 0 0 0 28.1-29.1" fill="currentColor"></path>
</svg>
</a>
</li>
<li class="mr-4">
<a href="https://www.facebook.com/DataKindOrg/" data-analytics-click="Footer, go to Facebook, text:facebook" title="DataKind on Facebook" aria-label="DataKind on Facebook">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 15.3 15.4" class="d-block" height="18">
<path d="M14.5 0H.8a.88.88 0 0 0-.8.9v13.6a.88.88 0 0 0 .8.9h7.3v-6h-2V7.1h2V5.4a2.87 2.87 0 0 1 2.5-3.1h.5a10.87 10.87 0 0 1 1.8.1v2.1h-1.3c-1 0-1.1.5-1.1 1.1v1.5h2.3l-.3 2.3h-2v5.9h3.9a.88.88 0 0 0 .9-.8V.8a.86.86 0 0 0-.8-.8z" fill="currentColor"></path>
</svg>
</a>
</li>
<li class="mr-4">
<a href="https://www.youtube.com/@datakind6672" data-analytics-click="Footer, go to YouTube, text:youtube" title="DataKind on YouTube" aria-label="DataKind on YouTube">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 19.17 13.6" class="d-block" height="16">
<path d="M18.77 2.13A2.4 2.4 0 0 0 17.09.42C15.59 0 9.58 0 9.58 0a57.55 57.55 0 0 0-7.5.4A2.49 2.49 0 0 0 .39 2.13 26.27 26.27 0 0 0 0 6.8a26.15 26.15 0 0 0 .39 4.67 2.43 2.43 0 0 0 1.69 1.71c1.52.42 7.5.42 7.5.42a57.69 57.69 0 0 0 7.51-.4 2.4 2.4 0 0 0 1.68-1.71 25.63 25.63 0 0 0 .4-4.67 24 24 0 0 0-.4-4.69zM7.67 9.71V3.89l5 2.91z" fill="currentColor"></path>
</svg>
</a>
</li>
<li class="mr-4 flex-self-start">
<a href="https://www.linkedin.com/company/datakind/mycompany/" data-analytics-click="Footer, go to Linkedin, text:linkedin" title="DataKind on Linkedin" aria-label="DataKind on LinkedIn">
<svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 19 18" class="d-block" height="18">
<path d="M3.94 2A2 2 0 1 1 2 0a2 2 0 0 1 1.94 2zM4 5.48H0V18h4zm6.32 0H6.34V18h3.94v-6.57c0-3.66 4.77-4 4.77 0V18H19v-7.93c0-6.17-7.06-5.94-8.72-2.91z" fill="currentColor"></path>
</svg>
</a>
</li>
</ul>
<ul class="list-style-none d-flex text-gray">
<li class="mr-3">
© <span id="year"></span> DataKind
</li>
<li class="mr-3"><a href="https://docs.github.com/en/github/site-policy/github-terms-of-service" data-analytics-click="Footer, go to terms, text:terms" class="link-gray">Terms</a></li>
<li class="mr-3"><a href="https://docs.github.com/en/github/site-policy/github-privacy-statement" data-analytics-click="Footer, go to privacy, text:privacy" class="link-gray">Privacy</a></li>
<li><a href="#" class="link-gray">Cookie settings</a></li>
</ul>
</div>
</div>
<script>
  // Fill the footer's copyright placeholder (#year) with the current calendar year.
  // `date` and `year` are still declared with `var` at the top level of this
  // script, so they remain globals for any other script that may read them.
  var date = new Date().getFullYear(),
    year = document.getElementById('year');
  if (year) {
    // Reuse the element that was already looked up, and write the year as
    // plain text instead of having it parsed as HTML.
    year.textContent = date;
  }
</script>
</footer>
<script src="https://www.google.com/recaptcha/api.js" async defer></script>
<script src="assets/js/gsap.min.js"></script>
<script src="assets/js/swiper-bundle.min.js"></script>
<script src="assets/js/bundle.js"></script>
<script src="assets/js/main.js"></script>
</body>
</html>