User:Slaporte/US Code

The US Code Formatting Tool will process a formatted ascii file of US Code.
Version history

8/16/2010 - fixed revisionIndex and related functions; improved documentation
8/14/2010 - Improved documentation; renamed functions & variables;
8/12/2010 - New code with docs
7/6/2010
This processes the HTML files downloadable from FDSys.
7/1/2010 - Demo code
Code

<?php

# This is a tool for batch processing the US Code, as
# downloaded from FDSys <http://www.gpo.gov/fdsys/>
#
# Last updated 8/16/2010

/**
* Split the bot file after it exceeds this size.
*
* This value is compared against the running total of the sizes returned by
* formatFDSys() (strlen() byte counts). Once the total reaches it, the current
* output textarea is closed and a new one is started.
*/
$splitSize = 500000;

/**
* Tells whether an integer is odd by inspecting its lowest bit.
*
* @param $int int Number to test
* @return int 1 when $int is odd, 0 when it is even
*/
function is_odd($int){
	$lowestBit = $int & 1;
	return $lowestBit;
}

/**
* Gather defined data from comments in html.
*
*  An html file from FDSys <http://www.gpo.gov/fdsys/> has
*  some data in comments, such as <!-- AUTHORITIES-PUBLICATION-YEAR:2010 -->,
*  at the start of the file.
*
*  Each field is looked up independently. A field whose comment is not present
*  is returned as null, so the result always has exactly ten entries in the
*  order listed below.
*
* @param $txt string Raw text from FDSys html file
* @return array
*   [0] => AUTHORITIES-PUBLICATION-NAME
*   [1] => AUTHORITIES-PUBLICATION-ID
*   [2] => AUTHORITIES-PUBLICATION-YEAR
*   [3] => AUTHORITIES-LAWS-ENACTED-THROUGH-DATE
*   [4] => SEARCHABLE-LAWS-ENACTED-THROUGH-DATE
*   [5] => AUTHORITIES-USC-TITLE-NAME
*   [6] => AUTHORITIES-USC-TITLE-ENUM
*   [7] => AUTHORITIES-USC-TITLE-STATUS
*   [8] => CONVERSION-PROGRAM
*   [9] => CONVERSION-DATETIME
*/
function collectPublicationData($txt){
	// The position in this list is the index of the field in the returned array.
	$fields = array(
		"AUTHORITIES-PUBLICATION-NAME",
		"AUTHORITIES-PUBLICATION-ID",
		"AUTHORITIES-PUBLICATION-YEAR",
		"AUTHORITIES-LAWS-ENACTED-THROUGH-DATE",
		"SEARCHABLE-LAWS-ENACTED-THROUGH-DATE",
		"AUTHORITIES-USC-TITLE-NAME",
		"AUTHORITIES-USC-TITLE-ENUM",
		"AUTHORITIES-USC-TITLE-STATUS",
		"CONVERSION-PROGRAM",
		"CONVERSION-DATETIME",
	);

	$data = array();
	foreach($fields as $field){
		// Non-greedy capture: stop at the first " -->" so that two comments
		// sharing one line are not merged into a single value.
		$pattern = "/<!-- ".preg_quote($field,"/").":(.*?) -->/";
		if(preg_match($pattern,$txt,$match)){
			$data[] = $match[1];
		}else{
			// Missing comment: explicit null instead of an undefined variable.
			$data[] = null;
		}
	}

	return $data;
}

/**
* Note on loadHTML()--
* Markup errors in loadHTML() are suppressed.  We can still
* process invalid markup. If necessary, use libxml_use_internal_errors(true) and 
* libxml_get_errors() to handle markup errors.
*/

/**
* Find the last listed statute at large. It is useful to
* find the key, for use with printRevisionIndex()
*
* Only the first <p> element of $txt is inspected. Every "N Stat. M" citation
* in that paragraph is collected and the last one is returned.
*
* @param $txt string Text in html (the source credit of a section)
* @return string|null The last "N Stat. M" citation in the first paragraph,
*   "none" when the first paragraph holds no such citation, or null when
*   $txt is empty or contains no <p> element at all.
*/
function determineRevisionKey($txt){
	// Explicit default: callers receive null (not an undefined variable)
	// when there is nothing to inspect.
	$recentamendment = null;

	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return $recentamendment;
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	$firstParagraph = $xpath->query('//p')->item(0);
	if(is_object($firstParagraph)){
		if(preg_match_all("/([0-9]+ Stat\. [0-9]+)/",$firstParagraph->nodeValue,$stat)){
			$recentamendment = end($stat[1]);
		}else{
			$recentamendment = "none";
		}
	}

	return $recentamendment;
}

/**
* Emit the statute => revision pairs as PHP source for a $revisionIndex array,
* the format expected by compareAmendments().
*
* The output is written directly to the output stream; nothing is returned.
*
* @param $keys array Formatted statute => revision
*/
function printRevisionIndex($keys){
	echo '$revisionIndex = array(', "\n";
	foreach($keys as $statute => $lastRevision){
		echo sprintf("\"%s\" => \"%s\",\n", $statute, $lastRevision);
	}
	echo ");";
}

/**
* Extract content from <h3> element
*
* Only the first <h3> in $txt is used. Its text is wrapped as a level-2
* wiki heading ("==...==") surrounded by blank lines.
*
* @param $txt string Text in html
* @return string Wiki heading built from the first <h3>, or "" when $txt is
*   empty or has no <h3> element
*/
function getFieldHeadThree($txt){
	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return "";
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	// Query once and reuse the node instead of re-running the XPath.
	$heading = $xpath->query('//h3')->item(0);
	if(!is_object($heading)){
		return "";
	}
	return "\n\n==".$heading->nodeValue."==\n\n";
}	

/**
* Extract content from <h4> element
*
* Only the first <h4> in $txt is used. Its text is wrapped as a level-3
* wiki heading ("===...===") surrounded by blank lines.
*
* @param $txt string Text in html
* @return string Wiki heading built from the first <h4>, or "" when $txt is
*   empty or has no <h4> element
*/
function getFieldHeadFour($txt){
	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return "";
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	// Query once and reuse the node instead of re-running the XPath.
	$heading = $xpath->query('//h4')->item(0);
	if(!is_object($heading)){
		return "";
	}
	return "\n\n===".$heading->nodeValue."===\n\n";
}	

/**
* Apply wikimarkup to indent each paragraph
*
* Indents according to the class of each <p> element. Each em specified in
* "statutory-body-Xem" gets one colon (":") of indent. A paragraph whose class
* does not carry an em count (for example plain "statutory-body") is not
* indented.
*
* The indent is computed afresh for every paragraph, so a flush paragraph
* that follows an indented one is emitted without any leading colons.
*
* @param $txt string Text in html, with FDSys classes
* @return string Indented paragraphs, no longer in html; each paragraph is
*   followed by "\n \n". Returns "" when $txt is empty or has no <p> element.
*/
function indentStatute($txt){
	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return "";
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	$text = "";
	foreach($xpath->query('//p') as $item){
		// Reset per paragraph so the depth never leaks from the previous one.
		$indentNumber = 0;
		if(preg_match("/statutory-body-([0-9]+)em/",$item->getAttribute('class'),$match)){
			$indentNumber = (int)$match[1];
		}
		$text .= str_repeat(":", $indentNumber).$item->nodeValue."\n \n";
	}
	return $text;
}

/**
* Create wikilist of sources
*
* Useful to turn a few paragraphs into a bulleted list. Before parsing, every
* "; Pub." in $txt is turned into a paragraph break so that each public law
* cited in a source credit lands on its own bullet. A leading "(" and a
* trailing ")" are stripped from the combined text.
*
* Paragraphs after the first that contain a {{page|N}} marker are skipped.
*
* @param $txt string Text in html
* @return string Text with each paragraph on a new line beginning with *.
*   When $txt is empty or has no <p> element the result is a lone "*".
*/
function getSourcecreditParagraph($txt){
	$result = "";

	// DOMDocument::loadHTML() rejects an empty source, so only parse when
	// there is something to parse.
	if(trim((string)$txt) !== ""){
		$txt = str_replace("; Pub.",";</p> <p>Pub.",$txt);
		$doc = new DOMDocument();
		@$doc->loadHTML($txt);
		$xpath = new DOMXPath($doc);

		$isFirst = true;
		foreach($xpath->query('//p') as $item){
			if($isFirst){
				// The first paragraph is always kept; its bullet is added below.
				$result = $item->nodeValue;
				$isFirst = false;
			}elseif(!preg_match("{{page\|[0-9]+}}",$item->nodeValue)){
				$result = $result."\n*".$item->nodeValue;
			}
		}
	}

	return "*".preg_replace("/^(\()|(\))$/","",$result);
}

/**
* Extract content from <p> element
*
* The text of every <p> element in $txt is collected in document order and
* joined with a blank line between paragraphs.
*
* @param $txt string Text in html
* @return string Content of <p> element(s), or "" when $txt is empty or has
*   no <p> element
*/
function getP($txt){
	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return "";
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	$paragraphs = array();
	foreach($xpath->query('//p') as $item){
		$paragraphs[] = $item->nodeValue;
	}
	return implode("\n\n", $paragraphs);
}

/**
* Extract content from <div class="analysis">
*
* The text of every <div class="analysis"> is concatenated and, when at least
* one such div exists, followed by the wiki table terminator "\n|}". The text
* of every <p> element is then appended, each preceded by a blank line (the
* first paragraph has no separator when no analysis div was found).
*
* @param $txt string Text in html
* @return string Content of <div class="analysis"> element(s) followed by the
*   paragraphs, or "" when $txt is empty or contains neither
*/
function getAnalysisSection($txt) {
	// DOMDocument::loadHTML() rejects an empty source, so bail out early.
	if(trim((string)$txt) === ""){
		return "";
	}

	$doc = new DOMDocument();
	@$doc->loadHTML($txt);
	$xpath = new DOMXPath($doc);

	// $hasContent tracks whether anything has been emitted yet, which is
	// what decides whether a separator is needed before the next paragraph.
	$result = "";
	$hasContent = false;

	foreach($xpath->query('//div[@class="analysis"]') as $item){
		$result = $result.$item->nodeValue;
		$hasContent = true;
	}
	if($hasContent){
		$result = $result."\n|}";
	}

	foreach($xpath->query('//p') as $item){
		if($hasContent){
			$result = $result."\n\n".$item->nodeValue;
		}else{
			$result = $item->nodeValue;
			$hasContent = true;
		}
	}

	return $result;
}
/**
* Split the expcite html comment into an array of info on each document
*
* Each document in an FDSys html file includes an html comment
* <!-- expcite: ... --> that includes the name and number of the title, subtitle
* chapter, subchapter, part, and section. This information is separated by "!@!"
*
* Every entry that is not found in $input is returned as false, including
* "level" when no structural element is recognised at all.
*
* @param $input string The info found in <!-- expcite: ... -->
* @return array
*  "titleNo" => TITLE NUMBER
*  "titleName" => TITLE FULL NAME
*  "subtitleNo" => SUBTITLE NUMBER
*  "subtitleName" => SUBTITLE FULL NAME
*  "chapNo" => CHAPTER NUMBER
*  "chapName" => CHAPTER FULL NAME
*  "subchapNo" => SUBCHAPTER NUMBER
*  "subchapName" => SUBCHAPTER FULL NAME
*  "partNo" => PART NUMBER
*  "partName" => PART FULL NAME
*  "secNo" => SECTION NUMBER
*  "level" => the last level identified in expcite ("title", "subtitle",
*             "chapter", "subchapter", "part" or "section")
*/
function expcite($input){
	// Start from "not found" for everything so that the returned array never
	// contains an undefined value.
	$titleNo = $titleName = false;
	$subtitleNo = $subtitleName = false;
	$chapNo = $chapName = false;
	$subchapNo = $subchapName = false;
	$partNo = $partName = false;
	$secNo = false;
	$level = false;

	foreach(explode("!@!",$input) as $cite){
		// Each piece is tested against every pattern; $level is overwritten
		// by whichever pattern matched most recently.
		if(preg_match("/^TITLE ([0-9]+)-([A-Za-z\s\-]+)/",$cite,$title)){
			$titleNo = $title[1];
			$titleName = $title[2];
			$level = "title";
		}
		if(preg_match("/S[Uu][Bb][Tt][Ii][Tt][Ll][Ee] ([A-Z]+)-([A-Za-z\s\-]+)/",$cite,$subtitle)){
			$subtitleNo = $subtitle[1];
			$subtitleName = $subtitle[2];
			$level = "subtitle";
		}
		if(preg_match("/^CHAPTER ([0-9A-Z]+)-([A-Za-z\s\-]+)/",$cite,$chap)){
			$chapNo = $chap[1];
			$chapName = $chap[2];
			$level = "chapter";
		}
		if(preg_match("/S[Uu][Bb][Cc][Hh][Aa][Pp][Tt][Ee][Rr] ([A-Z]+)-([A-Za-z\s\-]+)/",$cite,$subchap)){
			$subchapNo = $subchap[1];
			$subchapName = $subchap[2];
			$level = "subchapter";
		}
		if(preg_match("/P[Aa][Rr][Tt] ([A-Z0-9]+)-([A-Za-z\s0-9]+)/",$cite,$part)){
			$partNo = $part[1];
			$partName = $part[2];
			$level = "part";
		}
		if(preg_match("/Sec[s]*. ([0-9A-Za-z,\s\-]+)/",$cite,$sec)){
			$secNo = $sec[1];
			$level = "section";
		}
	}

	return array (
		"titleNo" => $titleNo,
		"titleName" => $titleName,
		"subtitleNo" => $subtitleNo,
		"subtitleName" => $subtitleName,
		"chapNo" => $chapNo,
		"chapName" => $chapName,
		"subchapNo" => $subchapNo,
		"subchapName" => $subchapName,
		"partNo" => $partNo,
		"partName" => $partName,
		"secNo" => $secNo,
		"level" => $level
	);
}

/**
* Names the document, including brackets
*
* The link target is "United States Code (YEAR)" followed by one path segment
* per structural level found by expcite() (title, subtitle, chapter,
* subchapter, part, section, in that order). The link label is the deepest
* segment found, without its leading slash.
*
* When $txt has no <!-- expcite: ... --> comment, no segment is added and the
* result is "[[United States Code (YEAR)|]]".
*
* @param $txt string Html of one document, containing <!-- expcite: ... -->
* @param $year string The year, item [2] from collectPublicationData()
* @return string The name of the document inside double brackets
*/
function getNames($txt,$year) {
	$path = "";
	$display = "";

	if(preg_match("/<!-- expcite:(.*) -->/",$txt,$expcite)){
		$cite = expcite($expcite[1]);

		// expcite() key => path prefix, ordered from outermost to innermost
		// so that $display ends up holding the deepest level present.
		$levels = array(
			"titleNo" => "/Title ",
			"subtitleNo" => "/Subtitle ",
			"chapNo" => "/Chapter ",
			"subchapNo" => "/Subchapter ",
			"partNo" => "/Part ",
			"secNo" => "/Sec. ",
		);
		foreach($levels as $key => $prefix){
			if(isset($cite[$key]) && $cite[$key] != false){
				$segment = $prefix.$cite[$key];
				$path = $path.$segment;
				$display = $segment;
			}
		}
	}

	return "[[United States Code (".$year.")".$path."|".trim($display,"/")."]]";
}

/**
* Compares the amendment from a current document with a previous
* year's amendment for the same document
*
* Since a new year's publication may include duplicate information, running a comparison
* allows transclusion (such as {{:Content}}) instead of reuploading duplicate info
*
* Only index keys of the form "United States Code (YEAR)<name>" whose name
* ends in a digit are considered.
*
* @param $name string Name of the section (the path after "United States Code (YEAR)")
* @param $revisionThisYear string Revision determined by determineRevisionKey()
* @param $indexFromPreviousYear array Statute => revision pairs, as generated by
*   printRevisionIndex() on the previous year's title. Anything that is not an
*   array (for example "") means "no index available".
*
* @return string|false The year of the matching entry in the index, or false
*   when there is no index, no revision, or no matching entry
*/
function compareAmendments($name, $revisionThisYear, $indexFromPreviousYear){
	// Nothing to compare against, or nothing to compare with.
	if(!is_array($indexFromPreviousYear) || $revisionThisYear == ""){
		return false;
	}

	foreach($indexFromPreviousYear as $statute => $revisionPrevYear){
		/**
		* separate out the year from the title... the year goes in $statuteArray[1]
		* and the section name in $statuteArray[2]
		*/
		if(!preg_match("/United States Code \(([0-9]+)\)(.*[0-9]+$)/",$statute,$statuteArray)){
			continue;
		}
		if(($statuteArray[2] == $name)&&($revisionThisYear == $revisionPrevYear)) {
			return $statuteArray[1];
		}
	}

	return false;
}

/**
* Default for the previous year's revision index.
*
* formatFDSys() reads the global $revisionIndex (the variable that the output
* of printRevisionIndex() defines). PHP variable names are case-sensitive, so
* the default must use exactly that spelling; "" means "no index available".
*/
if(!isset($revisionIndex)){
	$revisionIndex = "";
}

/**
* Format each section of the document.
*
* Prints the bot-file text for one section: the statute page (or, when the
* section is unchanged since the previous year's index, a page that merely
* transcludes the previous year's page), the talk page with {{textinfo-fdsys}},
* and, for sections, a citation redirect.
*
* Side effects: writes to the output stream, reads the global $revisionIndex
* and records the section's revision in the global $masterList.
*
* @param $html string Complete html of the section
* @param $data array Generated by collectPublicationData()
* @param $previous string Name of previous section, collected in array from getNames()
* @param $next string Name of next section, collected in array from getNames()
*
* @return int Size in bytes (strlen) of the text accumulated in $fulltext.
*   The page header that is printed directly is not included in this count.
*/
function formatFDSys($html, $data, $previous, $next) {
	global $revisionIndex;
	global $masterList;

	$revision = "";
	$currentyear = $data[2];
	$title = $data[6];

	/**
	* A comment in the html includes some useful info in <!-- documentid:... -->,
	* so this is captured in $sectioninfo and put in {{textinfo-fdsys}} on the
	* section's talk page. Anything that is absent stays "Unspecified".
	*/
	$sectioninfo = array(
		"documentid" => "Unspecified",
		"usckey" => "Unspecified",
		"currentthrough" => "Unspecified",
		"PDFPage" => "Unspecified",
	);
	if(preg_match("/documentid:([0-9a-z_,\-]+)[ ]*(usckey:([0-9a-z]+))*[ ]*(currentthrough:([0-9]+))*[ ]*(documentPDFPage:([0-9]+))*/",$html,$docinfo)){
		// field => capture group holding its value
		$groups = array("documentid" => 1, "usckey" => 3, "currentthrough" => 5, "PDFPage" => 7);
		foreach($groups as $field => $group){
			// Optional groups are either missing or "" when they did not match.
			if(isset($docinfo[$group]) && $docinfo[$group] !== ""){
				$sectioninfo[$field] = $docinfo[$group];
			}
		}
	}

	/**
	* Edit HTML
	*/

	//wikifyCite() slows things down sometimes, so the next two lines are commented out
	//$x = wikifyCite($html,$formats);
	//$html = str_replace("&lt;","<",$x[0]);

	/**
	* Removes extra spacing
	*/
	$html = str_replace("<p class=\"note-body-flush0_hang4\">&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;","<p class=\"note-body-flush0_hang4\">:",$html);

	/**
	* Replaces html small caps with Template:Small-caps
	*/
	$html = preg_replace("/<cap-smallcap>/","{{sc|",$html);
	$html = preg_replace("/<\/cap-smallcap>/","}} ",$html);

	/**
	* Replaces html foot note links with Template:ref
	*/
	$html = preg_replace("/<a href=\"#[0-9_]+_target\" name=\"([0-9_]+)\">([0-9]+)<\/a>/","{{ref|$1|$2}}",$html);
	$html = preg_replace("/<a href=\"#[0-9_]+\" name=\"([0-9_]+)_target\"><sup>([0-9]+)<\/sup>(.*)<\/a>/","{{note|$1|$2}} $3</a>",$html);

	/**
	* Build wiki tables
	*/
	$html = preg_replace("/<div class=\"analysis-head-left\">(.*)<\/div>/","|-\n! $1 \n!",$html);
	$html = preg_replace("/<h4 class=\"analysis-subhead\">(.*)<\/h4>/","<div class=\"analysis\">\n|-\n! colspan=2 | $1</div>",$html);
	$html = preg_replace("/<div class=\"two-column-analysis-style-content-left\">(.+)\.<\/div><div class=\"two-column-analysis-style-content-right\">(.*)(<sup>.*<\/sup>)*<\/div>/","|- \n| [[$data[6] U.S.C. &sect; $1 ($currentyear)|$1]]\n| $2 $3",$html);
	$html = preg_replace("/<div class=\"two-column-analysis-style-content-left\" id=\"wide\">(.*)<\/div><\/div>
	/","|- \n| colspan=2 | $1\n",$html);
	$html = preg_replace("/<div class=\"two-column-analysis-style-content-left\">(.*)<\/div><div class=\"two-column-analysis-style-content-right\" id=\"wide\">(.*)(<sup>.*<\/sup>)*<\/div>/","|- \n| colspan=2 | $1  $2",$html);

	/**
	* Title Table
	*/
	$html = preg_replace("/<div class=\"three-column-analysis-style-content-left\">([0-9A-Za-z–]+)\.<\/div><div class=\"three-column-analysis-style-content-center\">(.*)[&nbsp;]*(<sup>.*<\/sup>)*<\/div><div class=\"three-column-analysis-style-content-right\">(.*)<\/div>/","|- \n| [[United States Code ($data[2])/Title $data[6]/Chapter $1|$1]]\n| $2 &sect; $3 $4 $5",$html);
	$html = preg_replace("/<div><div class=\"three-column-analysis-style-content-left\">([0-9A-Za-z–]+)\.<\/div><div class=\"three-column-analysis-style-content-center\" id=\"wide\">(.*)<\/div><div class=\"three-column-analysis-style-content-right\">(.*)<\/div><\/div>/","|- \n| [[United States Code ($data[2])/Title $title/Chapter $1|$1]]\n| $2 &sect; $3",$html);

	/**
	* Chapter Table
	*/
	$html = preg_replace("/<table class=\"uscdispo2col\" cellspacing=\"0\" width=\"700\"><caption>(.*)<\/caption>
	/","<p>\n{| class=wikitable \n|- \n| colspan=2 | $1\n",$html);
	$html = preg_replace("/<tr><th id=\"row0col0\">(.*)<\/th><th id=\"row0col1\">(.*)<\/th><\/tr>/","|- \n! $1\n! $2",$html);
	$html = preg_replace("#<tr>\n<td class=\"left\">(.*)<\/td>\n<td class=\"right\">(.*)<\/td>\n<\/tr>#","|- \n| $1\n| $2",$html);
	$html = str_replace("</table>","|}\n</p>",$html);
	$html = preg_replace("/<p class=\"intabledata\">(.*)<p\/>/","'''$1'''",$html);

	/**
	* Replace html comment with PDFPage to Template:page
	*/
	//$html = preg_replace("/<!-- PDFPage:([0-9]+) -->/","<p>{{page|$1}}</p>",$html);

	/**
	* run expcite() when the expcite comment is found; otherwise fall back to
	* an "nothing identified" record so the lookups below are always defined
	*/
	if(preg_match("/<!-- expcite:(.*) -->/",$html,$expcite)){
		$cite = expcite($expcite[1]);
	}else{
		$cite = array("titleNo" => false, "titleName" => false, "subtitleNo" => false, "subtitleName" => false, "chapNo" => false, "chapName" => false, "subchapNo" => false, "subchapName" => false, "partNo" => false, "partName" => false, "secNo" => false, "level" => false);
	}

	/**
	* Split the html into (field name, field html) pairs. With
	* PREG_SPLIT_DELIM_CAPTURE the pieces alternate: text before the first
	* marker, field name, field text, field name, field text, ...
	*/
	$html = str_replace("<br />"," ",$html);
	$split = preg_split("/<!-- field-start:([A-Za-z\-]+) -->/",$html,0,PREG_SPLIT_DELIM_CAPTURE);

	$txt = array();
	foreach($split as $k => $v){
		if($k == 0){
			$txt[] = array("intro", $v);
		}elseif(is_odd($k)){
			$txt[] = array($v, trim($split[$k+1]));
		}
	}

	/**
	* Build name: one path segment per structural level that was identified
	*/
	$nameTitle = ($cite["titleNo"] != false) ? "/Title ".$cite['titleNo'] : "";
	$nameSubtitle = ($cite["subtitleNo"] != false) ? "/Subtitle ".$cite['subtitleNo'] : "";
	$nameChapter = ($cite["chapNo"] != false) ? "/Chapter ".$cite['chapNo'] : "";
	$nameSubchapter = ($cite["subchapNo"] != false) ? "/Subchapter ".$cite['subchapNo'] : "";
	$namePart = ($cite["partNo"] != false) ? "/Part ".$cite['partNo'] : "";
	$nameSec = ($cite["secNo"] != false) ? "/Sec. ".$cite['secNo'] : "";

	$name = $nameTitle.$nameSubtitle.$nameChapter.$nameSubchapter.$namePart.$nameSec;

	/**
	* The revision is needed before the body is built, because it decides
	* whether the body is built at all (see compareAmendments() below).
	*/
	foreach($txt as $section) {
		if($section[0] == "sourcecredit"){
			$revision = determineRevisionKey($section[1]);
		}
	}

	/**
	* Build $fulltext, the body of the section
	*/
	$fulltext = "";

	$prevYearCheck = compareAmendments($name, $revision, $revisionIndex);
	if(preg_match("/[0-9]+/",(string)$prevYearCheck,$prevYearFound)){
		/**
		* if content already exists, transclude it rather than uploading it
		*/
		print "{{-start-}} \n'''United States Code (".$currentyear.")".$name."''' \n\n{{:United States Code (".$prevYearFound[0].")".$name."}} \n\n{{-stop-}}\n";
	}else{
		/**
		* Field types grouped by how they are rendered.
		*/
		// fixed wiki heading followed by the paragraphs
		$titledNotes = array(
			"historicalandrevision-note" => "Historical and revision notes",
			"miscellaneous-note" => "Miscellaneous notes",
			"amendment-note" => "Amendment notes",
			"shorttitle-amendment-note" => "Short title amendment notes",
			"savingsprovision-note" => "Savings provision",
		);
		// <h3> heading, <h4> heading, then the paragraphs
		$headedFields = array("titlehead", "structuralhead", "repealedhead", "omittedhead");
		// <h4> heading, then the paragraphs
		$noteFields = array(
			"effectivedate-note",
			"executivedate-amendment-note",
			"shorttitle-note",
			"codification-note",
			"changeofname-note",
			"referenceintext-note",
			"function-transfer-note",
			"repealsummary",
			"effectivedate-repeal-note",
			"futureamendment-note",
			"terminationdate-amendment-note",
			"effectivedate-termination-amendment-note",
			"effectivedate-terminationdate-note",
			"terminationdate-note",
			"function-delegation-note",
			"repeal-note",
			"construction-amendment-note",
			"construction-note",
			"function-abolition-construction",
			"titleenactmentcredit",
			"terminationdate-repeal-note",
			"function-transfer-repeal-savingsclause-similarprovisions-note",
		);
		$footnotes = array();

		/**
		* go through each section and get headers and text
		*/
		foreach($txt as $section){
			$field = $section[0];
			$body = $section[1];

			if($field == "head"){
				$fulltext = $fulltext . getFieldHeadThree($body);
			}elseif($field == "statute"){
				$fulltext = $fulltext . indentStatute($body);
			}elseif($field == "analysis"){
				$fulltext = $fulltext . "\n{| class=wikitable".getAnalysisSection($body);
			}elseif($field == "sourcecredit"){
				$fulltext = $fulltext . "\n\n===Source(s)===\n\n".getSourcecreditParagraph($body);
			}elseif(isset($titledNotes[$field])){
				$fulltext = $fulltext . "\n\n===".$titledNotes[$field]."===\n\n".getP($body);
			}elseif(in_array($field, $headedFields, true)){
				$fulltext = $fulltext . getFieldHeadThree($body).getFieldHeadFour($body).getP($body);
			}elseif(in_array($field, $noteFields, true)){
				$fulltext = $fulltext . getFieldHeadFour($body).getP($body);
			}elseif($field == "priorprovisions-note"){
				// collected here, emitted after the body as the foot notes section
				$footnotes[] = getFieldHeadFour($body).getP($body);
			}
			// any other field type (including "intro") is not rendered
		}

		/**
		* add start marker, title, and {{USC-header}}
		*/
		print "{{-start-}} \n'''United States Code (".$currentyear.")".$name."''' \n&lt;noinclude>\n{{USC-header | cite = ".$cite['titleNo']." USC Sec. ".$cite['secNo']." | year = ".$data[2]." | title = ".$cite['titleNo']." | title-name = ".$cite['titleName']." | subtitle = ".$cite['subtitleNo']." | subtitle-name = ".$cite['subtitleName']."  | chapter = ".$cite['chapNo']." | chapter-name = ".$cite['chapName']." | subchapter = ".$cite['subchapNo']." | subchapter-name = ".$cite['subchapName']." | part = ".$cite['partNo']." | part-name = ".$cite['partName']." | section = ".$cite['secNo']." | level = ".$cite['level']." | previous = ".$previous." | next = ".$next." | revision = ".$revision." | current =  | notes =  }}\n&lt;/noinclude>\n\n";

		/**
		* Build footnotes section, only when something was collected above
		*/
		if(count($footnotes) > 0){
			$fulltext = $fulltext . "\n\n===Foot notes===\n\n".implode("\n", $footnotes);
		}

		/**
		* add a Last-amended category, for checking a section against a previous year
		*/
		if(isset($revision) && $revision !== "") {
			$fulltext = $fulltext . "\n[[Category:Last-amended ".$revision."]]";
		}else{
			$fulltext = $fulltext . "[[Category:Last-amendment unknown]]";
		}
		$fulltext = $fulltext . "\n{{-stop-}}\n";

	}

	/**
	* statute's talk page
	*/

	$fulltext = $fulltext .   "\n{{-start-}} \n'''Talk:United States Code (".$currentyear.")".$name."''' \n\n{{textinfo-fdsys | Authorities-Publication-Name = ".$data[0]." | Authorities-Publication-ID = ".$data[1]." | Authorities-Pulication-Year = ".$data[2]." | Authorities-Laws-Enacted-Through-Date = ".$data[3]." | Searchable-Laws-Enacted-Through-Date = ".$data[4]." | Authorities-USC-Title-Name = ".$data[5]." | Authorities-USC-Title-Enum = ".$data[6]." | Authorities-USC-Title-Status = ".$data[7]." | Conversion-Program = ".$data[8]." | Conversion-Datetime = ".$data[9]."| documentid = ".$sectioninfo['documentid']." | usckey = ".$sectioninfo['usckey']." | currentthrough = ".$sectioninfo['currentthrough']." | PDFPage = ".$sectioninfo['PDFPage']." }} \n\n{{-stop-}} \n";


	/**
	* create citation redirect, for convenient linking. Only sections get
	* one; the redirect points at the full page name built above.
	*/
	if($nameSec != "") {
		$fulltext = $fulltext."\n{{-start-}}\n'''".$title." U.S.C. &sect; ".$cite['secNo']." (".$currentyear.")'''\n\n\n#REDIRECT [[United States Code (".$currentyear.")".$name."]]\n\n\n{{-stop-}}\n";
	}

	$sectionSize = strlen($fulltext);

	print $fulltext;

	/**
	* generate master list of sections
	*/
	$masterList["United States Code (".$data[2].")".$name] = $revision;

	return $sectionSize;
}

print "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />";

/**
* URLs to process, taken from the "u" query parameter. Multiple URLs are
* separated by |. The list stays empty when no URL was submitted, so that a
* first visit shows only the form instead of trying to fetch an empty URL.
*/
$us = array();
if(isset($_GET["u"]) && is_string($_GET["u"]) && $_GET["u"] !== ""){
	$us = explode("|",$_GET["u"]);
}

?>
<html>
<head>
<title>Format Title of U.S. Code</title>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<style>
fieldset {
width:70em;
}
</style>
</head>
<fieldset>
<legend>Enter URL of HTML file to format</legend>
<form method="get">
<label for="u">URL</label>
<input type="text" name="u" value="<?php /* echo the submitted URL back, escaped so it cannot break out of the attribute */ if(isset($_GET['u']) && is_string($_GET['u'])) { print htmlspecialchars($_GET['u'], ENT_QUOTES, 'UTF-8'); } ?>"><br />
<button type="submit">Submit</button><br />
</form></fieldset>
<fieldset>
<legend>Create bot file</legend>

<?php

/**
* assemble the full code
*
* For every URL in $us: download the FDSys html file, split it into sections
* at each <!-- documentid:... --> comment, print the publication data, print
* every section via formatFDSys() into textareas (starting a new textarea
* whenever $splitSize is reached), and finally print the revision index
* collected in $masterList.
*
* NOTE(review): the URL comes straight from the query string, so this script
* will fetch whatever address it is given; consider restricting it to the
* FDSys host before exposing the page publicly.
*/

foreach($us as $u){
	$targetURL = $u;
	$userAgent = '';

	/**
	* Open up each file specified in $us. Multiple files are separated by |
	*/
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_USERAGENT, $userAgent);
	curl_setopt($ch, CURLOPT_URL,$targetURL);
	curl_setopt($ch, CURLOPT_FAILONERROR, true);
	curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
	curl_setopt($ch, CURLOPT_AUTOREFERER, true);
	curl_setopt($ch, CURLOPT_RETURNTRANSFER,true);
	curl_setopt($ch, CURLOPT_TIMEOUT, 100);
	$html= curl_exec($ch);

	/**
	* Stop before doing any processing when the download failed
	*/
	if (!$html) {
		echo "\ncURL error number:" .curl_errno($ch)."\ncURL error:" . curl_error($ch);
		exit;
	}

	/**
	* With PREG_SPLIT_DELIM_CAPTURE the pieces alternate: [0] is the text
	* before the first documentid comment, every odd index is the start of a
	* documentid comment and the following even index is the rest of that
	* section.
	*/
	$sections = preg_split("/(<!-- documentid:[0-9]+_[\-ch0-9]+)/",$html,-1,PREG_SPLIT_DELIM_CAPTURE);

	$data = collectPublicationData($html);
	print "<ul> \n<li>AUTHORITIES-PUBLICATION-NAME: ".$data[0]."</li> \n<li>AUTHORITIES-PUBLICATION-ID: ".$data[1]."</li> \n<li>AUTHORITIES-PUBLICATION-YEAR: ".$data[2]."</li> \n<li>AUTHORITIES-LAWS-ENACTED-THROUGH-DATE: ".$data[3]."</li> \n<li>SEARCHABLE-LAWS-ENACTED-THROUGH-DATE: ".$data[4]."</li> \n<li>AUTHORITIES-USC-TITLE-NAME: ".$data[5]."</li> \n<li>AUTHORITIES-USC-TITLE-ENUM: ".$data[6]."</li> \n<li>AUTHORITIES-USC-TITLE-STATUS: ".$data[7]."</li> \n<li>CONVERSION-PROGRAM: ".$data[8]."</li> \n<li>CONVERSION-DATETIME: ".$data[9]."</li> \n</ul>";

	/**
	* get a list of names, for the "Next" and "Previous" sections. The list
	* is rebuilt for every file so that its indexes line up with $sections.
	*/
	$names = array();
	foreach($sections as $section){
		$names[] = getNames($section,$data[2]);
	}
	$sizeTotal = 0;

	/**
	* print each section via formatFDSys()
	*/
	print "<textarea cols=100 rows=30>";
	foreach($sections as $key=>$section){
		if($key == 0){
			/**
			* Text before the first documentid comment. Its "next" is the
			* first real section, whose content (and name) sits at index 2.
			*/
			formatFDSys($section, $data, "", isset($names[2]) ? $names[2] : "");
			unset($sections[$key]);
		}elseif(is_odd($key)){
			/**
			* The section is the captured comment start plus the text after
			* it. The neighbouring sections' names sit two pieces before and
			* after this section's content piece ($key+1). If it is the last
			* section in the title, there is no "next".
			*/
			$nextName = isset($names[$key+3]) ? $names[$key+3] : "";
			$sectionSize = formatFDSys($section.$sections[$key+1],$data,$names[$key-1],$nextName);
			unset($sections[$key]);
			$sizeTotal = $sizeTotal + $sectionSize;

			/**
			* Separate each section into manageable chunks of text
			*/
			if($sizeTotal >= $splitSize){
				print "</textarea><br/> \nSplit at: ".$sizeTotal."b <br/> \n<textarea cols=100 rows=30>";
				$sizeTotal = 0;
			}
		}
	}
	print "</textarea>";

	/**
	* Print the revision index collected by formatFDSys() in $masterList.
	* printRevisionIndex() writes its output directly, so the surrounding
	* markup is printed before and after the call.
	*/
	print "<h2>Revision key for Title ".$data[5].", ".$data[2]."</h2> \n<textarea name='revisionkey' cols=100 rows=30>\n";
	printRevisionIndex(isset($masterList) && is_array($masterList) ? $masterList : array());
	print " \n</textarea>";

}

?>
</fieldset>
</html>
Version history

Code

See also