#!/usr/bin/perl
#
# Discourse DB analysis-page generator.
#
# Reads the pipe-delimited query-result files "topic", "is_for",
# "is_against", "is_mixed" and "author", aggregates them in memory, and
# writes an HTML report to "results.html" in the current directory.
#
# NOTE(review): the copy of this script that was reviewed had been run
# through an HTML-to-text conversion that removed every HTML tag from the
# string literals and damaged the file-reading statement.  The tags below
# (<table>, <p>, <ul>, <a>, <b>, ...) were reconstructed from the remaining
# "\n" pattern and should be confirmed against a pristine copy.  The link
# targets of the "here" references in the page introduction could not be
# recovered and are left as plain text.

use strict;
use warnings;
use Data::Dumper;    # not used by the report itself; kept for ad-hoc debugging

### constants

# Bit flags so that one integer can record every stance an author has taken
# on a position.
my $FOR_VALUE     = 1;
my $MIXED_VALUE   = 2;
my $AGAINST_VALUE = 4;

# Stance names paired with their bit flag, in display order.
my @STANCES = (
    [ 'for',     $FOR_VALUE ],
    [ 'mixed',   $MIXED_VALUE ],
    [ 'against', $AGAINST_VALUE ],
);

# Indexed by the zero-based month number that localtime() returns.
my @MONTH_NAMES = qw(
    January February March     April   May      June
    July    August   September October November December
);

# Number of leading characters of an entity URL that are dropped to obtain
# the link text.
# NOTE(review): 28 is the length of "http://discoursedb.org/wiki/" - confirm
# that this is the prefix the data actually uses.
my $ENTITY_TEXT_OFFSET = 28;

# Only this many rows are shown in each "top N" table.
my $TABLE_ROWS = 5;

# Positions need at least this many items to be ranked by controversiality.
my $MIN_ITEMS_FOR_CONTROVERSIALITY = 5;

### helper functions

# Serialises a hash (passed as a flat key/value list) into a single
# "key|value|key|value" string.  Pair order follows Perl's hash order.
sub hash_to_string {
    my %results_row = @_;
    return join('|', %results_row);
}

# Inverse of hash_to_string(): splits a "key|value|..." string back into a
# hash, which is returned as a flat list.  A dangling key without a value is
# ignored.
sub string_to_hash {
    my ($str) = @_;

    # LIMIT -1 keeps trailing empty fields, so a key whose (last) value is the
    # empty string is not silently dropped.
    my @vals = split(/\|/, $str, -1);

    my %results_row;
    for (my $i = 0; $i + 1 < @vals; $i += 2) {
        $results_row{ $vals[$i] } = $vals[ $i + 1 ];
    }
    return %results_row;
}

# Reads a query-results file and returns its lines with the trailing newline
# removed.  Dies if the file cannot be opened, because a silently empty
# input would produce a misleading report.
#
# NOTE(review): the original path expression was lost in extraction; the
# name is used exactly as passed - confirm no directory or extension is
# expected.
sub read_results_file {
    my ($filename) = @_;

    open(my $results_fh, '<', $filename)
        or die "Cannot open results file '$filename' for reading: $!\n";

    my @rows;
    while (my $row = <$results_fh>) {
        chomp $row;    # make sure there's no newline at the end
        push @rows, $row;
    }

    close($results_fh)
        or die "Cannot close results file '$filename': $!\n";
    return @rows;
}

# Returns the English month name for a zero-based month number (0 = January),
# or nothing when the number is out of range.
sub month_str {
    my ($month_num) = @_;
    return unless defined $month_num && $month_num =~ /^\d+$/;
    return unless $month_num < @MONTH_NAMES;
    return $MONTH_NAMES[$month_num];
}

# Turns an entity URI from the query results into an HTML link whose text is
# the page name with underscores shown as spaces.
sub html_link {
    my ($entity) = @_;
    $entity = '' unless defined $entity;

    # get rid of 'Special:URIResolver' part
    $entity =~ s/Special:URIResolver\///g;

    # Somewhere in the RDF export / SPARQL querying process every '%' becomes
    # '-' and every real '-' becomes '-2D'; change them back.
    $entity =~ s/-/\%/g;
    $entity =~ s/\%2D/-/g;

    # The encoded en dash does not survive plain hex-decoding, so it is
    # emitted as an HTML entity instead.
    # NOTE(review): the replacement text was reconstructed - confirm.
    $entity =~ s/\%E2\%80\%93/&ndash;/g;

    # hexadecimal-unencode the rest of the string
    $entity =~ s/\%([A-Fa-f0-9]{2})/pack('C', hex($1))/eg;

    # The visible text is the URL without the domain name etc.; a string too
    # short to carry that prefix is shown unchanged.
    my $text = length($entity) > $ENTITY_TEXT_OFFSET
        ? substr($entity, $ENTITY_TEXT_OFFSET)
        : $entity;
    $text =~ s/_/ /g;

    return "<a href=\"$entity\">$text</a>";
}

# Formats a section title, followed by ": ".
sub html_topic {
    my ($text) = @_;
    return "<b>" . $text . "</b>: ";
}

# Builds an HTML table.
#
# Arguments: the maximum number of rows to show, the number of columns, that
# many column headers, and then the data rows (array references).  Fewer rows
# are shown when less data is available.  Cells that contain "http" are
# rendered as links.
sub html_data_table {
    my $num_rows       = shift;
    my $num_columns    = shift;
    my @column_headers = splice(@_, 0, $num_columns);
    my @data           = @_;

    # never read past the end of the data
    $num_rows = scalar @data if $num_rows > @data;

    my $html = "<table>\n";
    $html .= "<thead>\n<tr>\n";
    foreach my $header (@column_headers) {
        $html .= "<th>$header</th>\n";
    }
    $html .= "</tr>\n</thead>\n";
    $html .= "<tbody>\n";
    for (my $row = 0; $row < $num_rows; $row++) {
        $html .= "<tr>\n";
        for (my $col = 0; $col < $num_columns; $col++) {
            my $cell = $data[$row][$col];
            $cell = '' unless defined $cell;

            $html .= "<td>";
            # create link if entity is a URL
            $html .= ($cell =~ /http/) ? html_link($cell) : $cell;
            $html .= "</td>\n";
        }
        $html .= "</tr>\n";
    }
    $html .= "</tbody>\n</table>\n";
    return $html;
}

# Converts a stance bit mask into text such as "for & against".
sub stance_int_to_str {
    my ($stance_int) = @_;
    $stance_int = 0 unless defined $stance_int;

    my @names;
    foreach my $stance_info (@STANCES) {
        my ($name, $value) = @$stance_info;
        push @names, $name if $stance_int & $value;
    }
    return join(" & ", @names);
}

# Returns the number of elements behind an array reference (0 for undef).
sub count_items {
    my ($array_ref) = @_;
    return $array_ref ? scalar @$array_ref : 0;
}

# Finds the author pair with the longest position list in a two-level hash
# ($pairs{author1}{author2} = [positions]).  When $individuals_only is true,
# pairs involving an editorial board are skipped.  Returns
# (author1, author2, count); both names are '' and the count is 0 when no
# pair qualifies.  Keys are visited in sorted order so ties are resolved the
# same way on every run.
sub find_top_pair {
    my ($pairs_ref, $individuals_only) = @_;

    my ($top_author1, $top_author2, $top_count) = ('', '', 0);
    foreach my $author1 (sort keys %$pairs_ref) {
        next if $individuals_only && $author1 =~ /editorial_board/;
        foreach my $author2 (sort keys %{ $pairs_ref->{$author1} }) {
            next if $individuals_only && $author2 =~ /editorial_board/;
            my $count = count_items($pairs_ref->{$author1}{$author2});
            if ($count > $top_count) {
                ($top_author1, $top_author2, $top_count)
                    = ($author1, $author2, $count);
            }
        }
    }
    return ($top_author1, $top_author2, $top_count);
}

# Builds one "author pair" section of the report: an introductory sentence
# naming both authors, followed by a list of the positions concerned.
#
#   $title             section title
#   $intro             sentence start, up to and including "are"
#   $count_phrase      text that follows the count, e.g. "opinions in common"
#   $author1/$author2  the pair ('' when none was found)
#   $count             number of positions for the pair
#   $positions_ref     array ref of the positions for the pair (may be undef)
#   $stances_ref       hash ref: {position}{author} => stance bit mask
#   $show_both_stances true to list both authors' stances, false to list only
#                      the first author's
sub html_author_pair_section {
    my ($title, $intro, $count_phrase, $author1, $author2, $count,
        $positions_ref, $stances_ref, $show_both_stances) = @_;

    my $html = "<p>" . html_topic($title);

    # Nothing to link to or list when the data produced no such pair.
    if ($author1 eq '' || $author2 eq '') {
        return $html . "No such pair of authors was found.</p>\n";
    }

    $html .= "$intro " . html_link($author1) . " and " . html_link($author2)
        . ", with $count $count_phrase, on the following positions:</p>\n";

    $html .= "<ul>\n";
    foreach my $position (sort @{ $positions_ref || [] }) {
        my @stances = (stance_int_to_str($stances_ref->{$position}{$author1}));
        if ($show_both_stances) {
            push @stances, stance_int_to_str($stances_ref->{$position}{$author2});
        }
        $html .= "<li>" . html_link($position)
            . " (" . join(", ", @stances) . ")</li>\n";
    }
    $html .= "</ul>\n";
    return $html;
}

### MAIN

### Get contents of query-results files
my @topic_rows      = read_results_file("topic");
my @is_for_rows     = read_results_file("is_for");
my @is_against_rows = read_results_file("is_against");
my @is_mixed_rows   = read_results_file("is_mixed");
my @author_rows     = read_results_file("author");

my %rows_for_stance = (
    'for'     => \@is_for_rows,
    'mixed'   => \@is_mixed_rows,
    'against' => \@is_against_rows,
);

### Create auxiliary data needed for data processing

# %items_per_topic - hash of arrays - all items per each topic
my %items_per_topic;
foreach my $topic_row (@topic_rows) {
    my %cur_hash = string_to_hash($topic_row);
    my ($cur_topic, $cur_item) = @cur_hash{ 'topic', 'item' };
    next unless defined $cur_topic && defined $cur_item;    # malformed row
    push @{ $items_per_topic{$cur_topic} }, $cur_item;
}

# %items_per_opinion - two-dimensional hash of arrays -
# all items per combination of position and stance
my %items_per_opinion;
foreach my $stance_info (@STANCES) {
    my $stance = $stance_info->[0];
    foreach my $stance_row (@{ $rows_for_stance{$stance} }) {
        my %cur_hash = string_to_hash($stance_row);
        my ($cur_position, $cur_item) = @cur_hash{ 'position', 'item' };
        next unless defined $cur_position && defined $cur_item;
        push @{ $items_per_opinion{$cur_position}{$stance} }, $cur_item;
    }
}

# %authors - hash of arrays - all authors per item
# %items_per_author - hash of arrays - all items per author
my %authors;
my %items_per_author;
foreach my $author_row (@author_rows) {
    my %cur_hash = string_to_hash($author_row);
    my ($cur_item, $cur_author) = @cur_hash{ 'item', 'author' };
    next unless defined $cur_item && defined $cur_author;
    push @{ $authors{$cur_item} },            $cur_author;
    push @{ $items_per_author{$cur_author} }, $cur_item;
}

### Get data for analysis page

## Most popular topics
# Rows are [topic, number of items], most items first; ties by name.
my @num_items_per_topic =
    sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] }
    map  { [ $_, count_items($items_per_topic{$_}) ] }
    keys %items_per_topic;

## Most popular position and opinion
my @num_items_per_position;           # rows: [position, total items]
my @num_items_per_opinion;            # rows: [position, stance, items]
my @controversiality_per_position;    # rows: [position, for, mixed, against, score]
foreach my $position (keys %items_per_opinion) {
    my %num_items_for_stance;
    my $total_items            = 0;
    my $most_items_for_opinion = 0;

    foreach my $stance_info (@STANCES) {
        my $stance    = $stance_info->[0];
        my $num_items = count_items($items_per_opinion{$position}{$stance});

        $num_items_for_stance{$stance} = $num_items;
        $total_items += $num_items;
        $most_items_for_opinion = $num_items
            if $num_items > $most_items_for_opinion;

        push @num_items_per_opinion, [ $position, $stance, $num_items ];
    }
    push @num_items_per_position, [ $position, $total_items ];

    # add to the controversiality array only if there are enough items
    if ($total_items >= $MIN_ITEMS_FOR_CONTROVERSIALITY) {
        # The score is the share of items held by the largest stance, so a
        # lower score means a more evenly split (more controversial)
        # position.  Adding 1 to the numerator and 2 to the denominator
        # breaks the tie between different unanimous positions, 'rewarding'
        # those with more items.
        my $score = ($most_items_for_opinion + 1) / ($total_items + 2);
        push @controversiality_per_position,
            [ $position, @num_items_for_stance{ 'for', 'mixed', 'against' }, $score ];
    }
}
@num_items_per_position =
    sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] }
    @num_items_per_position;
@num_items_per_opinion =
    sort { $b->[2] <=> $a->[2] || $a->[0] cmp $b->[0] || $a->[1] cmp $b->[1] }
    @num_items_per_opinion;

## Most frequent author
my @num_items_per_single_author;      # rows: [author, items]
my @num_items_per_editorial_board;    # rows: [author, items]
foreach my $author (keys %items_per_author) {
    my $author_row = [ $author, count_items($items_per_author{$author}) ];
    if ($author =~ /editorial_board/) {
        push @num_items_per_editorial_board, $author_row;
    }
    else {
        push @num_items_per_single_author, $author_row;
    }
}
@num_items_per_single_author =
    sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] }
    @num_items_per_single_author;
@num_items_per_editorial_board =
    sort { $b->[1] <=> $a->[1] || $a->[0] cmp $b->[0] }
    @num_items_per_editorial_board;

## Authors most in agreement with and in opposition to one another

# %stances_held - two-dimensional hash - holds stances held, as a bit mask,
# for each combination of position and author
my %stances_held;
foreach my $position (keys %items_per_opinion) {
    foreach my $stance_info (@STANCES) {
        my ($stance, $value) = @$stance_info;
        foreach my $item (@{ $items_per_opinion{$position}{$stance} || [] }) {
            foreach my $author (@{ $authors{$item} || [] }) {
                $stances_held{$position}{$author} = 0
                    unless exists $stances_held{$position}{$author};
                $stances_held{$position}{$author} |= $value;
            }
        }
    }
}

# %agreements - two-dimensional hash of arrays - holds all positions on which
# two authors hold the same stances, with the alphabetically first author in
# the first index
# %disagreements - same layout, for positions on which their stances differ
my %agreements;
my %disagreements;
foreach my $position (keys %stances_held) {
    my $stance_of = $stances_held{$position};

    # Walking the sorted list with $first < $second visits every unordered
    # pair exactly once, with the alphabetically first author first.
    my @authors_on_position = sort keys %$stance_of;
    foreach my $first (0 .. $#authors_on_position - 1) {
        foreach my $second ($first + 1 .. $#authors_on_position) {
            my $author1 = $authors_on_position[$first];
            my $author2 = $authors_on_position[$second];

            # this needs to get more sophisticated
            if ($stance_of->{$author1} == $stance_of->{$author2}) {
                push @{ $agreements{$author1}{$author2} }, $position;
            }
            else {
                push @{ $disagreements{$author1}{$author2} }, $position;
            }
        }
    }
}

# now, get the most agreements and the most disagreements
my ($most_agreements_author1, $most_agreements_author2,
    $highest_num_agreements) = find_top_pair(\%agreements, 0);
my ($most_agreements_individ1, $most_agreements_individ2,
    $highest_num_agreements_individ) = find_top_pair(\%agreements, 1);

my ($most_disagreements_author1, $most_disagreements_author2,
    $highest_num_disagreements) = find_top_pair(\%disagreements, 0);
my ($most_disagreements_individ1, $most_disagreements_individ2,
    $highest_num_disagreements_individ) = find_top_pair(\%disagreements, 1);

### Create results file

# NOTE(review): the words "here" and "RDF::Query" below were presumably
# links; their targets could not be recovered and must be restored by hand.
my $html_header = <<'END';
<html>
<head>
<title>Discourse DB Analysis page</title>
</head>
<body>
<h1>Discourse DB Analysis</h1>
<p>The following information was obtained by making queries on the RDF data
produced by Discourse DB (you can see the RDF data here), and performing
further computation on the results. The entire operation was performed using
Perl. The action is done by two Perl scripts: the first gets the necessary raw
data from the RDF file through SPARQL queries, using the RDF::Query library,
and saves it into several text files; the second reads in the contents of
those text files, does the necessary computation and analysis on the data, and
generates the HTML file you're reading now (including this text). You can see
copies of these two scripts here and here, respectively. The decision was made
to have Perl do the heavy lifting, instead of getting these values directly
through complex SPARQL queries, because SPARQL, as flexible as it is, is also
much slower and more resource-intensive than a regular Perl script. However,
SPARQL's role in retrieving the data is invaluable: although this page exists
on the discoursedb.org domain, the queries and analysis could have been run
from anywhere in the world.</p>
<p>A short introduction to the Discourse DB naming system: a topic is a
political issue, such as a bill, law or military conflict. A position is a
possible course of action relating to a topic. A stance is a view on a
position: either "for", "against" or "mixed". An opinion is any one
combination of a position and a stance. Finally, an item is an editorial,
column or post published in a reputable newspaper, magazine or blog.</p>
<p>All information is specific to Discourse DB. Data was last updated on
END

my ($mday, $mon, $year) = (localtime(time))[ 3, 4, 5 ];
my $cur_date = month_str($mon) . sprintf(" %d, %d", $mday, $year + 1900);
$html_header .= "$cur_date.</p>\n";
$html_header .= <<'END';
<p>- Yaron Koren</p>
END

my $html_middle = "";

$html_middle .= "<p>" . html_topic("Most-written-about topics") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 2,
    "Topic", "Number of items", @num_items_per_topic);

$html_middle .= "<p>" . html_topic("Most-written-about positions") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 2,
    "Position", "Number of items", @num_items_per_position);

$html_middle .= "<p>" . html_topic("Most popular opinions") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 3,
    "Position", "Stance", "Number of items", @num_items_per_opinion);

# lowest score first: the most evenly split positions
@controversiality_per_position =
    sort { $a->[4] <=> $b->[4] || $a->[0] cmp $b->[0] }
    @controversiality_per_position;
$html_middle .= "<p>" . html_topic("Most controversial positions") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 4,
    "Position", "# for", "# mixed", "# against",
    @controversiality_per_position);

# highest score first: the most one-sided positions
@controversiality_per_position =
    sort { $b->[4] <=> $a->[4] || $a->[0] cmp $b->[0] }
    @controversiality_per_position;
$html_middle .= "<p>" . html_topic("Least controversial positions") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 4,
    "Position", "# for", "# mixed", "# against",
    @controversiality_per_position);

$html_middle .= "<p>"
    . html_topic("Most-frequently-appearing individual authors") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 2,
    "Author", "Number of items", @num_items_per_single_author);

$html_middle .= "<p>"
    . html_topic("Most-frequently-appearing editorial boards") . "</p>";
$html_middle .= html_data_table($TABLE_ROWS, 2,
    "Author", "Number of items", @num_items_per_editorial_board);

# Agreeing authors hold identical stances, so one stance per position is
# enough; for disagreeing authors both stances are listed.
$html_middle .= html_author_pair_section(
    "Most-aligned authors",
    "The two authors who agree on the most opinions are",
    "opinions in common",
    $most_agreements_author1, $most_agreements_author2,
    $highest_num_agreements,
    $agreements{$most_agreements_author1}{$most_agreements_author2},
    \%stances_held, 0);

$html_middle .= html_author_pair_section(
    "Most-aligned individual authors",
    "The two individual authors who agree on the most opinions are",
    "opinions in common",
    $most_agreements_individ1, $most_agreements_individ2,
    $highest_num_agreements_individ,
    $agreements{$most_agreements_individ1}{$most_agreements_individ2},
    \%stances_held, 0);

$html_middle .= html_author_pair_section(
    "Most-at-odds authors",
    "The two authors who disagree on the most opinions are",
    "divergent opinions",
    $most_disagreements_author1, $most_disagreements_author2,
    $highest_num_disagreements,
    $disagreements{$most_disagreements_author1}{$most_disagreements_author2},
    \%stances_held, 1);

$html_middle .= html_author_pair_section(
    "Most-at-odds individual authors",
    "The two individual authors who disagree on the most opinions are",
    "divergent opinions",
    $most_disagreements_individ1, $most_disagreements_individ2,
    $highest_num_disagreements_individ,
    $disagreements{$most_disagreements_individ1}{$most_disagreements_individ2},
    \%stances_held, 1);

my $html_footer = <<'END';
</body>
</html>
END

my $output_filename = "results.html";
open(my $html_fh, '>', $output_filename)
    or die "Cannot open '$output_filename' for writing: $!\n";
print {$html_fh} $html_header, $html_middle, $html_footer
    or die "Cannot write to '$output_filename': $!\n";
# buffered write errors only surface at close, so check it too
close($html_fh)
    or die "Cannot close '$output_filename': $!\n";