Scraping PA House/Senate Committees to JSON or CSV

The PA House and Senate each publish a list of committee assignments, but neither offers a way to export that list to an Excel-friendly format.

The below script does the following:

  • Extract the assignments as listed
  • Mark each rep against their committees and subcommittees
  • Link each rep to their district and party in a CSV

// Node's filesystem module: used to read the saved page and to write the results.
const fs = require('fs');
// Full markup of the locally saved committee-assignment page, decoded to a string.
const house = fs.readFileSync('./scraping/legislature/House Member Committee Assignments - PA House of Representatives.html', 'utf8');

// Papa Parse serializes the member records; cheerio gives jQuery-style access to the markup.
const Papa = require('papaparse');
const cheerio = require('cheerio');


  // Maps a representative's display name to their district number (kept as a string).
  const nameToDistrict = {};
  // Used as a set: one key per committee / "committee - subcommittee" column seen while scraping.
  const allCommittees = {};

  // Pasted roster, one representative per line. Columns are tab-separated:
  //   name, an extra column (empty, or link text on the last row), party, district.
  // Only the name (column 0) and district (column 3) are needed here.
  `Name		Party	District
  Aerion Abney		D	19
  Joseph Adams		R	139
  Mike Armanini		R	75
  Jacob Banta		R	4
  Jamie Barton		R	124
  Anthony Bellmon		D	203
  Jessica Benham		D	36
  Kerry Benninghoff		R	171
  Aaron Bernstine		R	8
  Ryan Bizzarro		D	3
  Timothy Bonner		R	17
  Stephanie Borowicz		R	76
  Lisa Borowski		D	168
  Heather Boyd		D	163
  Kevin Boyle		D	172
  Matthew Bradford		D	70
  Tim Brennan		D	29
  Tim Briggs		D	149
  Amen Brown		D	10
  Marla Brown		R	9
  Donna Bullock		D	195
  Danilo Burgos		D	197
  Frank Burns		D	72
  Mike Cabell		R	117
  Martin Causer		R	67
  Johanny Cepeda-Freytiz		D	129
  Morgan Cephas		D	192
  Melissa Cerrato		D	151
  Joe Ciresi		D	146
  Scott Conklin		D	77
  Bud Cook		R	50
  Jill Cooper		R	55
  Gina Curry		D	164
  Bryan Cutler		R	100
  Joseph D'Orsie		R	47
  Mary Jo Daley		D	148
  Eric Davanzo		R	58
  Tina Davis		D	141
  Jason Dawkins		D	179
  Daniel Deasy		D	27
  David Delloso		D	162
  Sheryl Delozier		R	88
  Russ Diamond		R	102
  Kyle Donahue		D	113
  George Dunbar		R	56
  Torren Ecker		R	193
  Joe Emrick		R	137
  Mindy Fee		R	37
  Elizabeth Fiedler		D	184
  Wendy Fink		R	94
  Justin Fleming		D	105
  Jamie Flick		R	83
  Ann Flood		R	138
  Dan Frankel		D	23
  Robert Freeman		D	136
  Paul Friel		D	26
  Jonathan Fritz		R	111
  Pat Gallagher		D	173
  John Galloway		D	140
  Valerie Gaydos		R	44
  Matthew Gergely		D	35
  Mark Gillen		R	128
  Jose Giral		D	180
  Barbara Gleim		R	199
  G. Roni Green		D	190
  Jim Gregory		R	80
  Keith Greiner		R	43
  Seth Grove		R	196
  Nancy Guenst		D	152
  Manuel Guzman Jr.		D	127
  Jim Haddock		D	118
  Joe Hamm		R	84
  Liz Hanbidge		D	61
  Patrick Harkins		D	1
  Jordan Harris		D	186
  Doyle Heffley		R	122
  Carol Hill-Evans		D	95
  Joe Hogan		R	142
  Joseph C. Hohenstein		D	177
  Kristine Howard		D	167
  Rich Irvin		R	81
  MaryLouise Isaacson		D	175
  R. Lee James		R	64
  Mike Jones		R	93
  Tom Jones		R	98
  Barry Jozwiak		R	5
  Joshua Kail		R	15
  Aaron Kaufer		R	120
  Rob Kauffman		R	89
  Carol Kazeem		D	159
  Dawn Keefer		R	92
  Malcolm Kenyatta		D	181
  Dallas Kephart		R	73
  Joe Kerwin		R	125
  Tarik Khan		D	194
  Patty Kim		D	103
  Emily Kinkead		D	20
  Stephen Kinsey		D	201
  Kate Klunk		R	169
  Bridget Kosierowski		D	114
  Rick Krajewski		D	188
  Leanne Krueger		D	161
  Charity Grimm Krupa		R	51
  Anita Astorino Kulik		D	45
  Thomas Kutz		R	87
  Andrew Kuzma		R	39
  Shelby Labs		R	143
  John Lawrence		R	13
  Robert Leadbeter		R	109
  Milou Mackenzie		R	131
  Ryan Mackenzie		R	187
  Maureen Madden		D	115
  Dave Madsen		D	104
  Abby Major		R	60
  Zachary Mako		R	183
  Steven Malagari		D	53
  David M. Maloney Sr.		R	130
  Kristin Marcell		R	178
  Brandon Markosek		D	25
  Jim Marshall		R	14
  Robert Matzie		D	16
  La'Tasha Mayes		D	24
  Joe McAndrew		D	32
  Joanna McClinton		D	191
  Jeanne McNeill		D	133
  Thomas L. Mehaffie III		R	106
  Steven Mentzer		R	97
  Robert Mercuri		R	28
  Robert Merski		D	2
  Carl Walker Metzgar		R	69
  Natalie Mihalek		R	40
  Brett Miller		R	41
  Dan Miller		D	42
  Dan Moul		R	91
  Kyle Mullins		D	112
  Brian Munroe		D	144
  Marci Mustello		R	11
  Ed Neilson		D	174
  Eric Nelson		R	57
  Napoleon Nelson		D	154
  Jennifer O'Mara		D	165
  Timothy O'Neal		R	48
  Donna Oberlander		R	63
  Jason Ortitay		R	46
  Danielle Friel Otten		D	155
  Clint Owlett		R	68
  Darisha Parker		D	198
  Eddie Day Pashinski		D	121
  Tina Pickett		R	110
  Chris Pielli		D	156
  Nick Pisciottano		D	38
  Tarah Probst		D	189
  Christopher Rabb		D	200
  Jack Rader Jr.		R	176
  Kathy Rapp		R	65
  Jim Rigby		R	71
  Brad Roae		R	6
  Leslie Rossi		R	59
  David H. Rowe		R	85
  Mark Rozzi		D	126
  Alec Ryncavage		R	119
  Abigail Salisbury		D	34
  Steve Samuelson		D	135
  Benjamin Sanchez		D	153
  Christina Sappey		D	158
  Paul Schemel		R	90
  Donna Scheuren		R	147
  John Schlegel		R	101
  Michael Schlossberg		D	132
  Louis C. Schmitt Jr.		R	79
  Peter Schweyer		D	134
  Stephenie Scialabba		R	12
  Greg Scott		D	54
  Melissa Shusterman		D	157
  Joshua Siegel		D	22
  Brian Smith		R	66
  Ismail Smith-Wade-El		D	49
  Jared Solomon		D	202
  Craig Staats		R	145
  Perry Stambaugh		R	86
  Mandy Steele		D	33
  Joanne Stehr		R	107
  Michael Stender		R	108
  James B. Struzzi II		R	62
  P. Michael Sturla		D	96
  Paul Takac		D	82
  Kathleen Tomlinson		R	18
  Jesse Topper		R	78
  Tim Twardzik		R	123
  Arvind Venkat		D	30
  Greg Vitali		D	166
  Ryan Warner		R	52
  Perry Warren		D	31
  Dane Watro		R	116
  Ben Waxman		D	182
  Joe Webster		D	150
  Parke Wentling		R	7
  Martina White		R	170
  Craig Williams		R	160
  Dan Williams		D	74
  Regina Young		D	185
  David Zimmerman		R	99
  Lindsay Powell	Representative Lindsay Powell - PA House of Representatives (state.pa.us)	D	21
  `
    .split("\n")
    // The first line is the column header, not a representative.
    .slice(1)
    .forEach((row) => {
      const line = row.trim();
      // The template literal ends with a whitespace-only line; ignore it.
      if (line === '') {
        return;
      }

      // Columns 1 and 2 (extra column and party) are intentionally skipped.
      const [name, , , district = ''] = line.split("\t");
      nameToDistrict[name.trim()] = district.trim();
    });



  // One flat record per representative:
  //   { party, name, district, "<committee>": role, "<committee> - <subcommittee>": role, ... }
  const members = [];

  /**
   * Splits a member's bio text into a display name and a one-letter party code.
   *
   * The text is split on commas. The last space-separated token of the final
   * comma part is treated as the party marker and its second character is kept
   * (so a marker such as "(D)" yields "D"). The first two comma parts are then
   * swapped and all parts are joined with spaces.
   * NOTE(review): presumably the page prints "Last, First (P)" — confirm against the saved HTML.
   *
   * @param {string} bioText - trimmed text of the member's bio element
   * @returns {{name: string, party: string}}
   */
  function parseMemberBio(bioText) {
    const parts = bioText.split(",");
    const lastIndex = parts.length - 1;

    const trailingWords = parts[lastIndex].trim().split(' ');
    const partyMarker = trailingWords.pop();
    parts[lastIndex] = trailingWords.join(' ');

    // Swap the first two comma-separated parts.
    [parts[0], parts[1]] = [parts[1], parts[0]];

    return {
      name: parts.join(" ").trim(),
      party: partyMarker.substring(1, 2),
    };
  }

  /**
   * Looks a scraped name up in nameToDistrict, retrying once with a single
   * middle initial (" X. ") removed.
   *
   * @param {string} name - display name produced by parseMemberBio
   * @returns {string} the district recorded for that name
   * @throws {Error} when neither spelling has a district in nameToDistrict
   */
  function lookupDistrict(name) {
    if (nameToDistrict[name]) {
      return nameToDistrict[name];
    }

    const withoutInitial = name.replace(/ \w[.] /, " ");
    if (nameToDistrict[withoutInitial]) {
      return nameToDistrict[withoutInitial];
    }

    throw new Error(`No district found for member "${name}"`);
  }

  const $ = cheerio.load(house);

  $('.MemberInfoCteeList-Member').toArray().forEach((element) => {
    const bioElt = $(element).children('.MemberInfoCteeList-Bio');
    const { name, party } = parseMemberBio(bioElt.text().trim());

    // Key order matters: it determines the leading columns of the exported table.
    const record = {
      party,
      name,
      district: lookupDistrict(name),
    };

    // Most recent top-level committee; used to prefix subcommittee columns.
    let lastCommittee = '';
    // Most recent column written; a following ", Role" element overwrites its role.
    let lastEntry = '';

    // The committee items sit three levels below the element that follows the bio.
    const committeeItems = $(bioElt).next().children().children().children().toArray();

    committeeItems.forEach((item) => {
      $(item).children().toArray().forEach((kid) => {
        const rawText = $(kid).text();

        // Text starting with ", " carries the role for the entry recorded just before it.
        if (rawText.indexOf(", ") === 0) {
          record[lastEntry] = rawText.substring(2).trim();
          allCommittees[lastEntry] = '';
          return;
        }

        // "Title - Role" in one element; without a hyphen the role defaults to "Member".
        const [rawTitle, rawRole = "Member"] =
          rawText.indexOf("-") > 0 ? rawText.split("-") : [rawText];
        const title = rawTitle.trim();
        const role = rawRole.trim();

        if (title.indexOf("Subcommittee ") < 0) {
          // Top-level committee.
          record[title] = role;
          allCommittees[title] = '';
          lastCommittee = title;
          lastEntry = title;
        } else {
          // Subcommittee: namespace it under the committee it follows. The same
          // trimmed key is used for the record, the column set and lastEntry so
          // a later ", Role" element updates this exact column.
          const key = `${lastCommittee} - ${title}`.trim();
          record[key] = role;
          allCommittees[key] = '';
          lastEntry = key;
        }
      });
    });

    members.push(record);
  });

  // Backfill: give every member an empty-string value for each committee column
  // they do not already own, so all records expose the same set of keys.
  for (const member of members) {
    for (const committee of Object.keys(allCommittees)) {
      if (member.hasOwnProperty(committee)) {
        continue;
      }
      member[committee] = '';
    }
  }



  // Write the scraped records twice: pretty-printed JSON and a tab-separated table.
  fs.writeFileSync('house.json', JSON.stringify(members, null, 2));
  // Papa.unparse emits commas by default; request tabs so the content matches the .tsv extension.
  fs.writeFileSync('house.tsv', Papa.unparse(members, { delimiter: '\t' }));