Minh Vo's Online Résumé - Information Systems Developer
SAMPLE PERL CODE
Extracts MCAS question data (raw HTML) from the Department of Education MCAS Web Application.
The script can take command-line options:
- -p (page number or page range)
- -s (page to start fetching)
- -e (page to end fetching)
- -y (filter by year code)
- -g (filter by grade level)
- -c (filter by category code)
- -h (help)
DOWNLOAD : [ mcasparser.pl ]
1: #!/usr/bin/perl
2: ########################################################################
3: ## Retrieve raw HTML text from the DOE's MCAS Question Search site
4: ## and parse it into usable data format. Basically, we are trying to copy
5: ## all MCAS Questions available from
6: ##
7: ## http://www.doe.mass.edu/mcas/search/default.asp
8: ##
9: ## This script uses a tree model approach in looking for and extracting
10: ## data from the raw text. See "HOW WE REPRESENT TREES" section at
11: ## http://search.cpan.org/~petek/HTML-Tree-3.23/lib/HTML/Element.pm
12: ## for explanation.
13: ##
14: ## First, we build a tree-structure from the HTML text downloaded from
15: ## MCAS site using HTML::TreeBuilder module. Once the tree is built,
16: ## it is fairly easy to search and extract contents we need. From the
17: ## HTML tree illustrated in the above link, and according to how current
18: ## MCAS page is laid out, we can retrieve all the questions from the
19: ## search result by:
20: ##
21: ## * From root of the tree (HTML)
22: ## * Find the 2nd <FORM>...</FORM> node (traversing down/up and from
23: ## left to right)
24: ## * Inside the <FORM> node found, questions can be found inside each of
25: ## the <TABLE>...</TABLE> nodes immediately below <FORM>, with
26: ## exception to the first 2 and last <TABLE> nodes which are the
27: ## form's drop-down boxes and navigation menu, etc.
28: ## * Within each <TABLE> node, we can then go further down to extract
29: ## data. For example:
30: ## 1st <TR> contain the year, subject, grade, question#, etc.
31: ## 2nd <TR> contain the "Standard:" data
32: ## 3rd <TR> could be blank or contain images such as "no-calculator"
33: ## 4th <TR> contain the question text (or link to "view reading
34: ## selection")
35: ## and so on...
36: ## * Once we get to this level, we can then use regular expression to
37: ## parse/extract the data we need.
38: ##
39: ##
40: ## For further explanation, see:
41: ## http://209.85.173.104/search?q=cache:GS4LZ7uaOyEJ:www.foo.be/docs/tpj/issues/vol5_3/tpj0503-0008.html+HTML::Element+content_list+problem&hl=en&ct=clnk&cd=2&gl=us&client=firefox-a
42: ##
43: ########################################################################
44:
45:
46: use File::Basename;
47: use LWP::UserAgent;
48: use HTML::TreeBuilder;
49: use Getopt::Std;
50:
51: use strict;
52:
########################################################################
## GLOBAL VARS
########################################################################

## Address of the MCAS question search page; the query string built
## from $MCAS_URL_Params is appended to it for every request.
my $MCAS_URL = 'http://www.doe.mass.edu/mcas/search/default.asp';

## Query-string fields sent to the search page, with the values used
## when no command line option overrides them. Values are stored the
## way they go on the wire, which is why '%' appears as '%25'.
my $MCAS_URL_Params = {
    'YearCode'              => 2007,
    'GradeID'               => '%25',   ## URL encoded '%' character
    'QuestionCategory'      => '',
    'FormSubmitted'         => 'yes',
    'ReportingCategoryCode' => '',
    'ShowReportingCategory' => '',
    'originalpage'          => 1,
    'page'                  => 1,
    'answers'               => 'on',
    'intro'                 => 'no',
    'advanced_search'       => '%25',
};

## Upper bound on the number of requests sent in one run, so a paging
## mistake cannot make the script fetch the same content forever.
my $Max_Requests = 200;

## Diagnostic verbosity on STDERR: any true value prints progress
## messages, > 3 also prints each URL, > 5 also dumps parsed questions.
my $DEBUG = 6;

## Number of questions matching the current search parameters, taken
## from the "Showing x to y of N" text of the first page fetched.
my $Total_Questions = 0;

## Highest question number seen in the last page fetched.
## NOTE(review): not referenced by the code shown in this listing.
my $Last_Seen_Question = -1;

## True once the last page of the search result has been fetched.
my $_On_Last_Page = 0;

## The shared LWP::UserAgent object (created on demand by init_ua).
my $UA;

## Command line options as filled in by getopts().
my %Opt;

## Explicit page numbers to fetch (from the -p command line option).
my @_Pages_To_Fetch;

## Last page to fetch when -e is given, even if the search result has
## more pages. ( '0' means no limit )
my $_Page_Limit = 0;

## Number of requests sent to the DOE web site so far.
my $_Fetch_Counter = 0;
114:
########################################################################
## MAIN
########################################################################

## Read the command line switches into %Opt. ('t' is accepted as a
## flag but is not used by the code in this listing.)
getopts("c:e:g:hp:s:ty:", \%Opt);

## -h: print help page and exit
if ( $Opt{'h'} ) {
    usage();
    exit;
}

## -y: fetch different year's data than the default; values below
## 2003 are ignored and the default year is kept
if ( $Opt{'y'} and ($Opt{'y'} >= 2003) ) {
    $MCAS_URL_Params->{YearCode} = $Opt{'y'};
}

## -g: fetch questions for grade level...
if ( $Opt{'g'} ) {
    $MCAS_URL_Params->{GradeID} = $Opt{'g'};
}

## -c: fetch questions in Category...
if ( $Opt{'c'} ) {
    $MCAS_URL_Params->{QuestionCategory} = $Opt{'c'};
}

## -s: start with page number 'n' rather than the default '1'
if ( $Opt{'s'} and ($Opt{'s'} > 0) ) {
    $MCAS_URL_Params->{page} = $Opt{'s'};
}

## -e: do not fetch past page 'n'. An end page equal to the start page
## is valid (it means "fetch exactly that one page"), so the comparison
## is '>=' rather than '>'.
if ( $Opt{'e'} ) {
    if ( $Opt{'e'} >= $MCAS_URL_Params->{page} ) {
        $_Page_Limit = $Opt{'e'};
    }
    else {
        print STDERR "Ignoring -e ", $Opt{'e'}, ": it is before the start page (",
                     $MCAS_URL_Params->{page}, ").\n";
    }
}

## -p: fetch only certain page numbers (default is to fetch all pages)
if ( $Opt{'p'} ) {
    @_Pages_To_Fetch = parse_cli_option_p($Opt{'p'});
    print STDERR "Will fetch pages: ", join(',', @_Pages_To_Fetch), "\n" if $DEBUG;
    ## this option overrides '-e' (reset $_Page_Limit to unlimited)
    $_Page_Limit = 0;
}

## Fetch page after page until mcas_get_next_page() returns a false
## value (last page reached, limit reached, or request failed). When
## -p was given, each call is handed the next explicit page number.
while ( my $raw_html = mcas_get_next_page(shift(@_Pages_To_Fetch)) ) {

    ## if there are more pages to fetch, reset $_On_Last_Page flag set
    ## by mcas_get_next_page()
    if ( @_Pages_To_Fetch ) {
        $_On_Last_Page = 0;
    }

    ## new HTML::TreeBuilder object
    my $root = HTML::TreeBuilder->new();

    ## parse the raw HTML returned from mcas_get_next_page. move on
    ## if parsing failed, but release the tree first so it is not leaked
    if ( ! $root->parse_content($raw_html) ) {
        $root->delete;
        next;
    }

    ##
    ## all the questions are inside the 2nd <FORM> node
    ##
    my $form = ($root->find('form'))[1];

    ## just making sure we found that <FORM> node. move on if not
    ## (again releasing the tree before skipping the page)
    if ( ! $form or ! ref($form) ) {
        $root->delete;
        next;
    }

    ## detach the <FORM> node found from the initial <HTML> document
    ## tree, $root, so we can get rid of $root to conserve memory
    $form->detach;
    $root->delete;

    ## get only the <TABLE> nodes directly below the form (that's
    ## where all questions are)
    my @question_nodes = content_list_by_tag_name($form, 'table');

    ## remove the first 2 <TABLE> nodes since they are the search and
    ## navigation/display boxes and not the actual questions ...
    splice(@question_nodes, 0, 2);
    ## ... and the last empty <TABLE> (spacer)
    pop @question_nodes;

    foreach my $node (@question_nodes) {
        ## fall back to an empty record so the debug dump below never
        ## dereferences an undefined value
        my $q_info = mcas_parse_question($node) || {};
        print STDERR "=============================================\n" if $DEBUG;
        print STDERR join("\n", map{ $_ . ' => "'. $q_info->{$_} . '"' } keys %{$q_info}), "\n" if $DEBUG > 5;
        print STDERR "Children nodes: ", scalar($node->content_list()), "\n" if $DEBUG > 5;
    }

    ## cleanup
    @question_nodes = ();
    $form->delete();

}

exit;
220:
221: ########################################################################
222: ## SUBROUTINES
223: ########################################################################
224:
##################################################
## Print the command line help text to STDOUT. The program name shown
## is the name this script was invoked as, without any directory part.
##
sub usage
{
    my $script = basename($0);

    my $help_text = <<"_EOH_";
Usage: $script [[-p <page#>,<page#>,...] | [-s <page#> [-e <page#>]]]
[-y <year_code>] [-g <grade_code>] [-c <category_code>]
$script -h

-p <page#>
Comma separated list of page number to retrieve. Page number can
be a range, i.e. 10-14). (default is all pages available)
-s <page#>
Retrieve questions starting on page <page#>.
-e <page#>
Stop at page <page#>.
-y <year_code>
Only retrieve question of year <year_code>. (default is '2007')
-g <grade_code>
Only retrieve questions for grade level <grade_code>.
(default 'all')
-c <category_code>
Only retrieve questions in category <category_code>. (default 'all')
-h Print this help page.

NOTE: -p option supercedes -s and -e
_EOH_

    print $help_text;

}
255:
##################################################
## Parse the value of the -p command line option into the list of
## page numbers to fetch.
##
## Argument: a comma separated string whose items are either a single
##           page number ('7') or an inclusive range ('6-8').
## Returns:  the page numbers as a flat list, in the order given
##           (empty list for an undefined/empty argument).
##
## Details:
##  * whitespace around items and around the '-' of a range is ignored
##  * numbers are normalised ('05' becomes 5) so they are sent to the
##    site in a canonical form
##  * pages are 1-based, so page 0 is dropped; mcas_get_next_page()
##    treats a false page number as "no page given" and a 0 here would
##    silently switch the run to sequential paging
##  * a range whose end is below its start yields no pages
##  * anything else is reported on STDERR and skipped
##
sub parse_cli_option_p
{
    my $opt_str = shift;
    my @pages   = ();

    return @pages if ! defined($opt_str) or $opt_str eq '';

    foreach my $token (split(/,/, $opt_str)) {
        $token =~ s/^\s+//;
        $token =~ s/\s+$//;
        next if $token eq '';

        my ($first, $last);
        if ( $token =~ /^(\d+)\s*-\s*(\d+)$/ ) {
            ## page range, '6-8' for example
            ($first, $last) = ($1 + 0, $2 + 0);
        }
        elsif ( $token =~ /^(\d+)$/ ) {
            ## single page
            $first = $last = $1 + 0;
        }
        else {
            print STDERR "Ignoring invalid page specification '$token'\n";
            next;
        }

        ## there is no page 0
        $first = 1 if $first < 1;

        push(@pages, $first .. $last);
    }

    return @pages;
}
##################################################
## Return the shared LWP::UserAgent object held in $UA, creating it on
## the first call. Later calls hand back the same object.
##
sub init_ua
{
    unless ( $UA and ref($UA) eq 'LWP::UserAgent' ) {
        print STDERR "Initializing LWP::UserAgent object\n" if $DEBUG;

        my %ua_settings = (
            ## give up if we can't establish connection after 10 seconds
            timeout    => 10,

            ## User-agent string (faking firefox 1.0.8 on XP)
            agent      => 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.7.13) Gecko/20060410 Firefox/1.0.8',

            ## cookie jar
            cookie_jar => {},
        );

        $UA = LWP::UserAgent->new(%ua_settings);
    }

    return $UA;
}
308:
##################################################
## Get the next page from the MCAS Search interface.
##
## Argument: optional page number. When given, that page is fetched
##           instead of the current $MCAS_URL_Params->{page}, and
##           $_On_Last_Page is set so that paging does not continue
##           unless the caller resets the flag.
## Returns:  the raw page content on success; undef when the last page
##           was already fetched, the -e page limit or the
##           $Max_Requests cap is reached, or the request failed.
##
## Side effects: increments $_Fetch_Counter for every request sent; on
## success advances $MCAS_URL_Params->{page}, initialises
## $Total_Questions from the page text, and sets $_On_Last_Page when
## the page has no 'Next' link.
##
sub mcas_get_next_page
{
    my $page_no = shift; # optional page number

    if ( $_On_Last_Page ) {
        print STDERR "Already fetched the last page. I'm stopping here.\n";
        return undef;
    }

    ## do not go past the page number specified by '-e'
    if ( $_Page_Limit and ($MCAS_URL_Params->{page} > $_Page_Limit) ) {
        print STDERR "Maximum page number reached ($_Page_Limit).\n";
        return undef;
    }

    ## runaway script?????
    if ( $_Fetch_Counter >= $Max_Requests ) {
        print STDERR "Maximum DOE search request reached ($Max_Requests).\n";
        return undef;
    }

    ##
    ## ok, fetch the next page...
    ##

    ## if $page_no is provided, fetch that one instead of current
    ## value in $MCAS_URL_Params->{page}. Also, set $_On_Last_Page so
    ## next call won't happen
    if ( $page_no ) {
        $MCAS_URL_Params->{page} = $page_no;
        $_On_Last_Page = 1;
    }

    ## build the URL query part from $MCAS_URL_Params
    ## (i.e. name1=value1&name2=value2 and so on). Values coming from
    ## the command line are percent-encoded so that spaces, '&', '='
    ## etc. cannot corrupt the query; keys are sorted so the same
    ## parameters always produce the same URL.
    my $url_query = join('&',
        map { $_ . '=' . _mcas_escape_query_value($MCAS_URL_Params->{$_}) }
        sort keys(%{$MCAS_URL_Params}));

    ## initialize LWP::UserAgent
    init_ua();

    print STDERR "Fetching page ", $MCAS_URL_Params->{page}, " from MCAS site" if $DEBUG;
    print STDERR " ${MCAS_URL}?${url_query}\n" if $DEBUG > 3;
    my $resp = $UA->get("${MCAS_URL}?${url_query}");

    ## increment $_Fetch_Counter
    $_Fetch_Counter++;

    if ( $resp->is_success ) {
        print STDERR ": ", $resp->status_line, "\n" if $DEBUG;
        my $content = $resp->content;

        ## increment page number
        $MCAS_URL_Params->{page}++;

        ##
        ## initialize global vars if they haven't been init'ed
        ##

        ## get the total questions found
        if ( ! $Total_Questions ) {
            if ( $content =~ /Showing +\d+ +to +\d+ +of +(\d+)/ ) {
                $Total_Questions = $1;
            }
        }

        ## another safe-guard feature... if there is a next page to fetch,
        ## the 'Next' in the page should be enclosed in an <a> tag. If
        ## that's not the case then we should prevent the next call to
        ## mcas_get_next_page() from happening
        if ( index($content, 'Next</a>') < 0 ) {
            $_On_Last_Page = 1;
        }

        return $content;
    }
    else {
        print STDERR ": ", $resp->status_line, "\n";
    }

    return undef;
}

##################################################
## Percent-encode one query-string value. Existing %XX escapes (such
## as the default '%25') are left untouched so values that are already
## encoded are not encoded twice; a bare '%' becomes '%25'; every other
## character outside A-Z a-z 0-9 - . _ ~ is written as %XX.
## An undefined value is treated as the empty string.
##
sub _mcas_escape_query_value
{
    my $value = shift;

    $value = '' if ! defined($value);

    ## a '%' that does not start a valid escape is literal data
    $value =~ s/%(?![0-9A-Fa-f]{2})/%25/g;

    ## escape everything that is not unreserved (and not part of an
    ## escape sequence handled above)
    $value =~ s/([^A-Za-z0-9\-._~%])/sprintf('%%%02X', ord($1))/ge;

    return $value;
}
396:
##################################################
## Similar to HTML::Element::content_list() but only returns the
## element children whose tag name is one of the wanted ones.
##
## Arguments: the parent node, followed by zero or more tag names
##            (matched case-insensitively against the child's tag,
##            which is compared in lower case). With no tag names,
##            every element child is returned.
## Returns:   the matching child nodes, in document order. Text
##            children (non-references) are never returned. A false
##            or non-reference parent yields an empty list, so callers
##            assigning to an array never receive a stray undef.
##
sub content_list_by_tag_name
{
    my $root = shift;
    my @tags = @_;
    my @nodes;

    ## bare return: empty list in list context, undef in scalar context
    return if ! $root;

    if ( ref($root) ) {
        ## lower-cased lookup table of the tags we want
        my %wanted;
        $wanted{ lc($_) } = 1 foreach @tags;

        foreach my $node ($root->content_list()) {
            ## skip plain text segments
            next if ! ref($node);

            if ( ! @tags or $wanted{ $node->tag } ) {
                push(@nodes, $node);
            }
        }
    }

    return @nodes;
}
429:
##################################################
## Turn one question <TABLE> node into a hash ref describing the
## question.
##
## Argument: the HTML::Element for the question's <TABLE>.
## Returns:  undef for a missing/non-reference argument; an empty hash
##           ref when the node is not a <TABLE>; otherwise a hash ref
##           holding whatever mcas_parse_question_metadata() extracted
##           from the first row, plus
##             standard => result of mcas_parse_question_standard()
##                         on the second row
##             raw_html => the whole table as HTML text
##
## Only <TR> nodes that are direct children of the table are examined:
##   1st <tr> => year, subject, grade, question#, question type, strand
##   2nd <tr> => standard
##
sub mcas_parse_question
{
    my $node = shift;

    return undef unless $node and ref($node);

    my $result = {};

    ## anything that is not a table carries no question data
    return $result if $node->tag ne 'table';

    my @rows = content_list_by_tag_name($node, 'tr');

    if ( @rows ) {
        ## first row: meta data; second row: the standard
        $result = mcas_parse_question_metadata($rows[0]);
        $result->{standard} = mcas_parse_question_standard($rows[1]);
    }

    $result->{raw_html} = $node->as_HTML;

    return $result;
}
463:
##################################################
## Parse the meta info from the question node/block (the first <TR>).
##
## Argument: the HTML::Element for the row.
## Returns:  undef for a missing/non-reference argument; otherwise a
##           hash ref that may contain the keys year, subject, grade,
##           question_number, question_type and strand (only the
##           fields that could be recognised are set; the hash is
##           empty when the node is not a <TR>).
##
## The row is flattened to HTML text and split at its line breaks.
## Both '<br>' and '<br />' spellings are accepted, since which one
## as_HTML emits is not visible from this code. Surrounding whitespace
## is stripped from the captured text fields.
##
sub mcas_parse_question_metadata
{
    my $node = shift;
    my $result = {};

    return undef if ! $node or ! ref($node);

    if ( $node->tag eq 'tr' ) {
        ## collapse this node into raw HTML string which will be
        ## easier to parse
        my $node_str = $node->as_HTML;

        ## split into 3 elements (1st, 2nd, 3rd line as they appear in
        ## the browser)
        my @lines = split(/<br\s*\/?>/i, $node_str);

        ## line 1 looks like this:
        ## <tr><td class="lg" colspan="2" width="550">2007, Biology - High School
        if ( defined($lines[0]) and $lines[0] =~ /\> *(\d+) *, +(.*) +\- +(.*) *$/ ) {
            ## copy the captures before any other regex runs
            my ($year, $subject, $grade) = ($1, $2, $3);
            $result->{year}    = $year;
            $result->{subject} = _mcas_trim($subject);
            $result->{grade}   = _mcas_trim($grade);
        }

        ## line 2 looks like this:
        ## Question 7: Multiple-Choice
        if ( defined($lines[1]) and $lines[1] =~ /Question +(\d+): +(.*) *$/ ) {
            my ($number, $type) = ($1, $2);
            $result->{question_number} = $number;
            $result->{question_type}   = _mcas_trim($type);
        }
        else {
            ## some questions don't have "Question #:"
            ## (first question in '2007', 'All', '-- ENGLISH LANGUAGE ARTS --')
            ## so the whole line is kept as the type (undef if the row
            ## had no second line at all)
            $result->{question_type} = _mcas_trim($lines[1]);
        }

        ## line 3 looks like this:
        ## <span class="nav em">Reporting Category: Biochemistry and Cell Biology</span> <span class="nav em"></span></td></tr>
        if ( defined($lines[2]) and $lines[2] =~ /Category: +([^\<]+)\</ ) {
            $result->{strand} = _mcas_trim($1);
        }
    }

    return $result;
}

##################################################
## Return a copy of the given string without leading and trailing
## whitespace; undef is passed through unchanged.
##
sub _mcas_trim
{
    my $text = shift;

    return $text if ! defined($text);

    $text =~ s/^\s+//;
    $text =~ s/\s+$//;

    return $text;
}
513:
##################################################
## Pull the question's "Standard" text out of the question block's
## second <TR>.
##
## Argument: the HTML::Element for the row.
## Returns:  undef for a missing/non-reference argument; otherwise the
##           text following "Standard:" up to the next tag, or an
##           empty string when the node is not a <TR> or carries no
##           such text.
##
## The row is expected to look like this once flattened to HTML:
## <tr><td class="nav em" colspan="2">Standard: Structure and Origins of Modern English - 5</td><td> </td></tr>
##
sub mcas_parse_question_standard
{
    my $node = shift;

    return undef unless $node and ref($node);

    ## only table rows can hold the standard
    return '' unless $node->tag eq 'tr';

    ## match against the row flattened to an HTML string; in list
    ## context the match yields the captured text or nothing
    my ($standard) = $node->as_HTML =~ /\>Standard: +([^\<]+)\</;

    return defined($standard) ? $standard : '';
}