Minh T. Vo

325 Columbus Ave Apt 11 | Boston, MA 02116 | 508-423-2199 | minh.ta.vogmail.com


Skip to main content.

Minh Vo's Online Résumé - Information Systems Developer

SAMPLE PERL CODE

Extracts MCAS question data (raw HTML) from the Department of Education MCAS Web Application.

The script can take command-line options:

  • -p (page number or page range)
  • -s (page to start fetching)
  • -e (page to end fetching)
  • -y (filter by year code)
  • -g (filter by grade level)
  • -c (filter by category code)
  • -h (help)

DOWNLOAD : [ mcasparser.pl ]

   1:  #!/usr/bin/perl
   2:  ########################################################################
   3:  ## Retrieve raw HTML text from the DOE's MCAS Question Search site
   4:  ## and parse it into usable data format.  Basically, we are trying to
   5:  ## copy all MCAS Questions available from
   6:  ##
   7:  ##   http://www.doe.mass.edu/mcas/search/default.asp
   8:  ##
   9:  ## This script uses a tree model approach in looking for and extracting
  10:  ## data from the raw text.  See "HOW WE REPRESENT TREES" section at
  11:  ## http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm
  12:  ## for explanation.
  13:  ##
  14:  ## First, we build a tree-structure from the HTML text downloaded from
  15:  ## MCAS site using HTML::TreeBuilder module.  Once the tree is built,
  16:  ## it is fairly easy to search and extract contents we need.  From the
  17:  ## HTML tree illustrated in the above link, and according to how current
  18:  ## MCAS page is laid out, we can retrieve all the questions from the
  19:  ## search result by:
  20:  ##
  21:  ## * From root of the tree (HTML)
  22:  ## * Find the 2nd <FORM>...</FORM> node (traversing down/up and from
  23:  ##   left to right)
  24:  ## * Inside the <FORM> node found, questions can be found inside each of
  25:  ##   the <TABLE>...</TABLE> nodes immediately below <FORM>, with
  26:  ##   exception to the first 2 and last <TABLE> nodes which are the
  27:  ##   form's drop-down boxes and navigation menu, etc.
  28:  ## * Within each <TABLE> node, we can then go further down to extract
  29:  ##   data.  For example:
  30:  ##     1st <TR> contain the year, subject, grade, question#, etc.
  31:  ##     2nd <TR> contain the "Standard:" data
  32:  ##     3rd <TR> could be blank or contain images such as "no-calculator"
  33:  ##     4th <TR> contain the question text (or link to "view reading
  34:  ##              selection")
  35:  ##     and so on...
  36:  ## * Once we get to this level, we can then use regular expression to
  37:  ##   parse/extract the data we need.
  38:  ##
  39:  ##
  40:  ## For further explanation, see:
  41:  ## http://209.85.173.104/search?q=cache:GS4LZ7uaOyEJ:www.foo.be/docs/tpj/issues/vol5_3/tpj0503-0008.html+HTML::Element+content_list+problem&hl=en&ct=clnk&cd=2&gl=us&client=firefox-a
  42:  ##
  43:  ########################################################################
  44:   
  45:   
  46:  use File::Basename;
  47:  use LWP::UserAgent;
  48:  use HTML::TreeBuilder;
  49:  use Getopt::Std;
  50:   
  51:  use strict;
  52:   
  53:  ########################################################################
  54:  ## GLOBAL VARS
  55:  ########################################################################
  56:   
  57:  ##
  58:  ## MCAS root URL
  59:  ##
  60:  my $MCAS_URL = 'http://www.doe.mass.edu/mcas/search/default.asp';
  61:   
  62:  ##
  63:  ## Parameters the above URL accept and default values
  64:  ##
  65:  my $MCAS_URL_Params = {
  66:      YearCode              => 2007,
  67:      GradeID               => '%25',          ## URL encoded '%' character
  68:      QuestionCategory      => '',
  69:      FormSubmitted         => 'yes',
  70:      ReportingCategoryCode => '',
  71:      ShowReportingCategory => '',
  72:      originalpage          => 1,
  73:      page                  => 1,
  74:      answers               => 'on',
  75:      intro                 => 'no',
  76:      advanced_search        => '%25'
  77:  };
  78:   
  79:  ##
  80:  ## Prevent run away script in which we repeatedly fetch the same
  81:  ## content (or looping through pages) over and over.
  82:  ##
  83:  my $Max_Requests = 200;
  84:   
  85:  ## Debugging flag, print out extra stuff
  86:  my $DEBUG = 6;
  87:   
  88:  ## total number of questions found using above $MCAS_URL_Params
  89:  ## search parameters
  90:  my $Total_Questions = 0;
  91:   
  92:  ## highest question number seen in the last page fetched
  93:  my $Last_Seen_Question = -1;
  94:   
  95:  ## flag indicates we've already fetched the last page
  96:  my $_On_Last_Page = 0;
  97:   
  98:  ## hold the LWP::UserAgent object
  99:  my $UA;
 100:   
 101:  ## hold command line options
 102:  my %Opt;
 103:   
 104:  ## hold optional pages needed to fetch (from command line arguments)
 105:  my @_Pages_To_Fetch = ();
 106:   
 107:  ## if passed by command line (-e) this indicates the last page we
 108:  ## will fetch even if the MCAS search result contain more.
 109:  ## ( '0' means no limit )
 110:  my $_Page_Limit = 0;
 111:   
 112:  ## counter for the number of requests we sent to DOE web site
 113:  my $_Fetch_Counter = 0;
 114:   
 115:  ########################################################################
 116:  ## MAIN
 117:  ########################################################################
 118:   
 119:  ## get command line option
 120:  getopts("c:e:g:hp:s:ty:", \%Opt);
 121:   
 122:  ## print help page and exit
 123:  if ( $Opt{'h'} ) {
 124:      usage();
 125:      exit;
 126:  }
 127:   
 128:  ## fetch different year's data than the default
 129:  if ( $Opt{'y'} and ($Opt{'y'} >= 2003) ) {
 130:      $MCAS_URL_Params->{YearCode} = $Opt{'y'};
 131:  }
 132:   
 133:  ## fetch questions for grade level...
 134:  if ( $Opt{'g'} ) {
 135:      $MCAS_URL_Params->{GradeID} = $Opt{'g'};
 136:  }
 137:   
 138:  ## fetch questions in Category...
 139:  if ( $Opt{'c'} ) {
 140:      $MCAS_URL_Params->{QuestionCategory} = $Opt{'c'};
 141:  }
 142:   
 143:  ## start with page number 'n' rather than the default '1'
 144:  if ( $Opt{'s'} and ($Opt{'s'} > 0) ) {
 145:      $MCAS_URL_Params->{page} = $Opt{'s'};
 146:  }
 147:   
 148:  ## do not fetch past page 'n'
 149:  if ( $Opt{'e'} and ($Opt{'e'} > $MCAS_URL_Params->{page}) ) {
 150:      $_Page_Limit = $Opt{'e'};
 151:  }
 152:   
 153:  ## fetch only certain page number (default is to fetch all pages)
 154:  if ( $Opt{'p'} ) {
 155:      @_Pages_To_Fetch = parse_cli_option_p($Opt{'p'});
 156:      print STDERR "Will fetch pages: ", join(',', @_Pages_To_Fetch), "\n" if $DEBUG;
 157:      ## this option override '-e' (reset $_Page_Limit to unlimit)
 158:      $_Page_Limit = 0;
 159:  }
 160:   
 161:  while ( my $raw_html = mcas_get_next_page(shift(@_Pages_To_Fetch)) ) {
 162:   
 163:      ## if there are more pages to fetch, reset $_On_Last_Page flag set
 164:      ## by mcas_get_next_page()
 165:      if ( @_Pages_To_Fetch ) {
 166:          $_On_Last_Page = 0;
 167:      }
 168:   
 169:      ## new HTML::TreeBuilder object
 170:      my $root = HTML::TreeBuilder->new();
 171:   
 172:      ## parse the raw HTML returned from mcas_get_next_page.  move on
 173:      ## if parse() failed
 174:      $root->parse_content($raw_html) or next;
 175:   
 176:      ##
 177:      ## all the questions are inside the 2nd <FORM> node
 178:      ## 
 179:   
 180:      my $form = ($root->find('form'))[1];
 181:   
 182:      ## just making sure we found that <FORM> node.  move on if not
 183:      if ( ! $form or ! ref($form) ) {
 184:          next;
 185:      }
 186:   
 187:      ## detach the <FORM> node found from the initial <HTML> documenent
 188:      ## tree, $root, so we could get rid of $root to conserve memory
 189:      $form->detach;
 190:      $root->delete;
 191:   
 192:      ## all questions found on this page will be stored in this array
 193:      my @question_nodes = ();
 194:   
 195:      ## get only the <TABLE> nodes (that's where all questions are)
 196:      @question_nodes = content_list_by_tag_name($form, 'table');
 197:   
 198:      ## remove the first 2 <TABLE> nodes since they are the search and
 199:      ## navigation/display boxes and not the actual questions.
 200:      for (my $i=0; $i<2; $i++) {
 201:          shift @question_nodes;
 202:      }
 203:      ## and the last empty <TABLE> (spacer)
 204:      pop @question_nodes;
 205:   
 206:      foreach my $node (@question_nodes) {
 207:          my $q_info = mcas_parse_question($node);
 208:          print STDERR "=============================================\n" if $DEBUG;
 209:          print STDERR join("\n", map{ $_ . ' => "'. $q_info->{$_} . '"' } keys %{$q_info}), "\n" if $DEBUG > 5;
 210:          print STDERR "Children nodes: ", scalar($node->content_list()), "\n" if $DEBUG > 5;
 211:      }
 212:   
 213:      ## cleanup
 214:      @question_nodes = ();
 215:      $form->delete();
 216:   
 217:  }
 218:   
 219:  exit;
 220:   
 221:  ########################################################################
 222:  ## SUBROUTINES
 223:  ########################################################################
 224:   
 225:  sub usage
 226:  {
 227:      my $progname = basename($0);
 228:   
 229:      print <<_EOH_
 230:  Usage: $progname [[-p <page#>,<page#>,...] | [-s <page#> [-e <page#>]]]
 231:              [-y <year_code>] [-g <grade_code>] [-c <category_code>]
 232:         $progname -h
 233:   
 234:      -p <page#>
 235:         Comma separated list of page number to retrieve.  Page number can
 236:         be a range, i.e. 10-14). (default is all pages available)
 237:      -s <page#>
 238:         Retrieve questions starting on page <page#>.
 239:      -e <page#>
 240:         Stop at page <page#>.
 241:      -y <year_code>
 242:         Only retrieve question of year <year_code>. (default is '2007')
 243:      -g <grade_code>
 244:         Only retrieve questions for grade level <grade_code>.
 245:         (default 'all')
 246:      -c <category_code>
 247:         Only retrieve questions in category <category_code>. (default 'all')
 248:      -h Print this help page.
 249:   
 250:  NOTE: -p option supercedes -s and -e
 251:  _EOH_
 252:  ;
 253:   
 254:  }
 255:   
 256:  ##################################################
 257:  ## parse out pages we want to fetch (passed in from command line
 258:  ## option)
 259:  ##
 260:  sub parse_cli_option_p
 261:  {
 262:      my $opt_str = shift;
 263:      my @opts = ();
 264:   
 265:      if ( $opt_str ) {
 266:          ## split by commas
 267:          my @arr = split(/\,/, $opt_str);
 268:   
 269:          ## go through each part and do further spliting
 270:          ## cuz page number could be a range, '6-8' for example.
 271:          foreach my $elem (@arr) {
 272:              if ( $elem =~ /^(\d+)\-(\d+)$/ ) {
 273:                  for (my $i=$1; $i<=$2; $i++) {
 274:                      push(@opts, $i);
 275:                  }
 276:              }
 277:              elsif ( $elem =~ /^\d+$/ ) {
 278:                  push(@opts, $elem);
 279:              }
 280:          }
 281:      }
 282:      return @opts;
 283:  }
 284:  ##################################################
 285:  ## Initializing the $UA LWP::UserAgent object
 286:  ##
 287:  sub init_ua
 288:  {
 289:      if ( $UA and ref($UA) eq 'LWP::UserAgent' ) {
 290:          ## $UA already init'ed
 291:          return $UA;
 292:      }
 293:   
 294:      print STDERR "Initializing LWP::UserAgent object\n" if $DEBUG;
 295:      $UA = LWP::UserAgent->new(
 296:              ## quit if we can't establish connection after 10 seconds
 297:              timeout    => 10,
 298:   
 299:              ## User-agent string (faking firefox 1.0.8 on XP)
 300:              agent      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.13) Gecko/20060410 Firefox/1.0.8',
 301:   
 302:              ## cookie jar
 303:              cookie_jar => {}
 304:      );
 305:   
 306:      return $UA;
 307:  }
 308:   
 309:  ##################################################
 310:  ## get the next page from MCAS Search interface
 311:  ##
 312:  sub mcas_get_next_page
 313:  {
 314:      my $page_no = shift;  # optional page number
 315:   
 316:      if ( $_On_Last_Page ) {
 317:          print STDERR "Already fetched the last page.  I'm stopping here.\n";
 318:          return undef;
 319:      }
 320:   
 321:      ## do not go past the page number specified by '-e'
 322:      if ( $_Page_Limit and ($MCAS_URL_Params->{page} > $_Page_Limit) ) {
 323:          print STDERR "Maximum page number reached ($_Page_Limit).\n";
 324:          return undef;
 325:      }
 326:   
 327:      ## runaway script?????
 328:      if ( $_Fetch_Counter >= $Max_Requests ) {
 329:          print STDERR "Maximum DOE search request reached ($Max_Requests).\n";
 330:          return undef;
 331:      }
 332:   
 333:      ##
 334:      ## ok, fetch the next page...
 335:      ##
 336:   
 337:      ## if $page_no is provided, fetch that one instead of current
 338:      ## value in $MCAS_URL_Params->{page}.  Also, set $_On_Last_Page so
 339:      ## next call won't happen
 340:      if ( $page_no ) {
 341:          $MCAS_URL_Params->{page} = $page_no;
 342:          $_On_Last_Page = 1;
 343:      }
 344:   
 345:      ## build the URL query part from $MCAS_URL_Params
 346:      ## (i.e. name1=value1&name2=value2 and so on)
 347:      my $url_query = join('&', map { $_ . '=' . $MCAS_URL_Params->{$_} } keys(%{$MCAS_URL_Params}));
 348:   
 349:   
 350:      ## initialize LWP::UserAgent
 351:      init_ua();
 352:   
 353:      print STDERR "Fetching page ", $MCAS_URL_Params->{page}, " from MCAS site" if $DEBUG;
 354:      print STDERR "  ${MCAS_URL}?${url_query}\n" if $DEBUG > 3;
 355:      my $resp = $UA->get("${MCAS_URL}?${url_query}");
 356:   
 357:      ## increment $_Fetch_Counter
 358:      $_Fetch_Counter++;
 359:   
 360:      if ( $resp->is_success ) {
 361:          print STDERR ": ", $resp->status_line, "\n" if $DEBUG;
 362:          my $content = $resp->content;
 363:   
 364:          ## increment page number
 365:          $MCAS_URL_Params->{page}++;
 366:   
 367:          ##
 368:          ## initialize global vars if they haven't been init'ed
 369:          ##
 370:   
 371:          ## get the total questions found
 372:          if ( ! $Total_Questions ) {
 373:              ## should use index & substr instead of the could-be-slow-ass
 374:              ## regex, but what the heck...
 375:              if ( $content =~ /Showing +\d+ +to +\d+ +of +(\d+)/ ) {
 376:                  $Total_Questions = $1;
 377:              }
 378:          }
 379:   
 380:          ## another safe-guard feature... if there is a next page to fetch,
 381:          ## the 'Next' in the page should be enclosed in an <a> tag.  If
 382:          ## that's not the case then we should prevent the next call to
 383:          ## mcas_get_next_page() from happening
 384:          if ( index($content, 'Next</a>') < 0 ) {
 385:              $_On_Last_Page = 1;
 386:          }
 387:   
 388:          return $content;
 389:      }
 390:      else {
 391:          print STDERR ": ", $resp->status_line, "\n";
 392:      }
 393:   
 394:      return undef;
 395:  }
 396:   
 397:  ##################################################
 398:  ## similar to HTML::Element::content_list() but only return
 399:  ## nodes that has certain tagname that we want
 400:  ##
 401:  sub content_list_by_tag_name
 402:  {
 403:      my $root = shift;
 404:      my @tags  = @_;
 405:      my @nodes;
 406:   
 407:      return undef if ! $root;
 408:      if ( ref($root) ) {
 409:          foreach my $node ($root->content_list()) {
 410:              if ( ref($node) ) {
 411:                  if ( @tags ) {
 412:                      ## only look for tags that we wants
 413:                      foreach my $wanted_tag (@tags) {
 414:                          if ( $node->tag eq lc($wanted_tag) ) {
 415:                              push(@nodes, $node);
 416:                              last;
 417:                          }
 418:                      }
 419:                  }
 420:                  else {
 421:                      ## get everything
 422:                      push(@nodes, $node);
 423:                  }
 424:              }
 425:          }
 426:      }
 427:      return @nodes;
 428:  }
 429:   
 430:  ##################################################
 431:  ## parse/extract info from the HTML::Element which has the MCAS question
 432:  ## data.
 433:  ##
 434:  sub mcas_parse_question
 435:  {
 436:      my $node = shift;
 437:      my $result = {};
 438:   
 439:      return undef if ! $node or ! ref($node);
 440:   
 441:      ## we know from the HTML source that each question is enclosed
 442:      ## in a <TABLE>.  Basically, the structure of the question (tree) is:
 443:      ##
 444:      ## <table>
 445:      ##   <tbody>
 446:      ##     <tr> => the year, , grade, question#, question type, strand
 447:      ##     <tr> => standard
 448:      ##     <tr> ... so on
 449:      ##
 450:   
 451:      if ( $node->tag eq 'table' ) {
 452:          my @q_parts = content_list_by_tag_name($node, 'tr');
 453:   
 454:          if ( @q_parts ) {
 455:              $result = mcas_parse_question_metadata(shift @q_parts);
 456:              $result->{standard} = mcas_parse_question_standard(shift @q_parts);
 457:          }
 458:          $result->{raw_html} = $node->as_HTML;
 459:      }
 460:   
 461:      return $result;
 462:  }
 463:   
 464:  ##################################################
 465:  ## parse the meta info from the question node/block
 466:  ## (the first <TR>)
 467:  ## return a hash ref with year,subject,grade,q#,q_type,strand
 468:  ##
 469:  sub mcas_parse_question_metadata
 470:  {
 471:      my $node = shift;
 472:      my $result = {};
 473:   
 474:      return undef if ! $node or ! ref($node);
 475:   
 476:      if ( $node->tag eq 'tr' ) {
 477:          ## colapse this node into raw HTML string which will be
 478:          ## easier to parse
 479:          my $node_str = $node->as_HTML;
 480:   
 481:          ## split into 3 element (1st, 2nd, 3rd line as appear in browser)
 482:          my @lines = split(/\<br\s*\/\>/, $node_str);
 483:   
 484:          ## line 1 looks like this:
 485:          ## <tr><td class="lg" colspan="2" width="550">2007, Biology - High School
 486:          if ( $lines[0] =~ /\> *(\d+) *, +(.*) +\- +(.*) *$/ ) {
 487:              $result->{year}    = $1;
 488:              $result->{subject} = $2;
 489:              $result->{grade}   = $3;
 490:          }
 491:   
 492:          ## line 2 looks like this:
 493:          ## Question 7: Multiple-Choice
 494:          if ( $lines[1] =~ /Question +(\d+): +(.*) *$/ ) {
 495:              $result->{question_number}    = $1;
 496:              $result->{question_type} = $2;
 497:          }
 498:          else {
 499:              ## some question doesn't have "Question #:"
 500:              ## (first question in '2007', 'All', '-- ENGLISH LANGUAGE ARTS --')
 501:              $result->{question_type} = $lines[1];
 502:          }
 503:   
 504:          ## line 3 looks like this:
 505:          ## <span class="nav em">Reporting Category: Biochemistry and Cell Biology</span>&nbsp;<span class="nav em"></span></td></tr>
 506:          if ( $lines[2] =~ /Category: +([^\<]+)\</ ) {
 507:              $result->{strand}    = $1;
 508:          }
 509:      }
 510:   
 511:      return $result;
 512:  }
 513:   
 514:  ##################################################
 515:  ## parse the meta info from the question node/block
 516:  ## (the 2nd <TR>)
 517:  ## return string (question's Standard)
 518:  ##
 519:  sub mcas_parse_question_standard
 520:  {
 521:      my $node = shift;
 522:      my $result = '';
 523:   
 524:      return undef if ! $node or ! ref($node);
 525:   
 526:      if ( $node->tag eq 'tr' ) {
 527:          ## colapse this node into raw HTML string which will be
 528:          ## easier to parse
 529:          my $node_str = $node->as_HTML;
 530:   
 531:          ## line should looks like this:
 532:          ## <tr><td class="nav em" colspan="2">Standard: Structure and Origins of Modern English - 5</td><td>&nbsp;</td></tr>
 533:          if ( $node_str =~ /\>Standard: +([^\<]+)\</ ) {
 534:              $result = $1;
 535:          }
 536:      }
 537:   
 538:      return $result;
 539:  }