diff options
Diffstat (limited to 'hugo/libraries/plugins/import/ImportMediawiki.class.php')
| -rw-r--r-- | hugo/libraries/plugins/import/ImportMediawiki.class.php | 573 |
1 files changed, 573 insertions, 0 deletions
diff --git a/hugo/libraries/plugins/import/ImportMediawiki.class.php b/hugo/libraries/plugins/import/ImportMediawiki.class.php new file mode 100644 index 0000000..767c966 --- /dev/null +++ b/hugo/libraries/plugins/import/ImportMediawiki.class.php @@ -0,0 +1,573 @@ +<?php +/* vim: set expandtab sw=4 ts=4 sts=4: */ +/** + * MediaWiki import plugin for phpMyAdmin + * + * @package PhpMyAdmin-Import + * @subpackage MediaWiki + */ +if (! defined('PHPMYADMIN')) { + exit; +} + +/* Get the import interface */ +require_once 'libraries/plugins/ImportPlugin.class.php'; + +/** + * Handles the import for the MediaWiki format + * + * @package PhpMyAdmin-Import + * @subpackage MediaWiki + */ +class ImportMediawiki extends ImportPlugin +{ + /** + * Whether to analyze tables + * + * @var bool + */ + private $_analyze; + + /** + * Constructor + */ + public function __construct() + { + $this->setProperties(); + } + + /** + * Sets the import plugin properties. + * Called in the constructor. + * + * @return void + */ + protected function setProperties() + { + $this->_setAnalyze(false); + if ($GLOBALS['plugin_param'] !== 'table') { + $this->_setAnalyze(true); + } + + $props = 'libraries/properties/'; + include_once "$props/plugins/ImportPluginProperties.class.php"; + + $importPluginProperties = new ImportPluginProperties(); + $importPluginProperties->setText(__('MediaWiki Table')); + $importPluginProperties->setExtension('txt'); + $importPluginProperties->setMimeType('text/plain'); + $importPluginProperties->setOptions(array()); + $importPluginProperties->setOptionsText(__('Options')); + + $this->properties = $importPluginProperties; + } + + /** + * This method is called when any PluginManager to which the observer + * is attached calls PluginManager::notify() + * + * @param SplSubject $subject The PluginManager notifying the observer + * of an update. + * + * @return void + */ + public function update (SplSubject $subject) + { + } + + /** + * Handles the whole import logic + * + * @return void + */ + public function doImport() + { + global $error, $timeout_passed, $finished; + + // Defaults for parser + + // The buffer that will be used to store chunks read from the imported file + $buffer = ''; + + // Used as storage for the last part of the current chunk data + // Will be appended to the first line of the next chunk, if there is one + $last_chunk_line = ''; + + // Remembers whether the current buffer line is part of a comment + $inside_comment = false; + // Remembers whether the current buffer line is part of a data comment + $inside_data_comment = false; + // Remembers whether the current buffer line is part of a structure comment + $inside_structure_comment = false; + + // MediaWiki only accepts "\n" as row terminator + $mediawiki_new_line = "\n"; + + // Initialize the name of the current table + $cur_table_name = ""; + + while (! $finished && ! $error && ! $timeout_passed ) { + $data = PMA_importGetNextChunk(); + + if ($data === false) { + // Subtract data we didn't handle yet and stop processing + $offset -= strlen($buffer); + break; + } elseif ($data === true) { + // Handle rest of buffer + } else { + // Append new data to buffer + $buffer = $data; + unset($data); + // Don't parse string if we're not at the end + // and don't have a new line inside + if ( strpos($buffer, $mediawiki_new_line) === false ) { + continue; + } + } + + // Because of reading chunk by chunk, the first line from the buffer + // contains only a portion of an actual line from the imported file. + // Therefore, we have to append it to the last line from the previous + // chunk. If we are at the first chunk, $last_chunk_line should be empty. + $buffer = $last_chunk_line . $buffer; + + // Process the buffer line by line + $buffer_lines = explode($mediawiki_new_line, $buffer); + + $full_buffer_lines_count = count($buffer_lines); + // If the reading is not finalised, the final line of the current chunk + // will not be complete + if (! $finished) { + $full_buffer_lines_count -= 1; + $last_chunk_line = $buffer_lines[$full_buffer_lines_count]; + } + + for ($line_nr = 0; $line_nr < $full_buffer_lines_count; ++ $line_nr) { + $cur_buffer_line = trim($buffer_lines[$line_nr]); + + // If the line is empty, go to the next one + if ( $cur_buffer_line === '' ) { + continue; + } + + $first_character = $cur_buffer_line[0]; + $matches = array(); + + // Check beginnning of comment + if (! strcmp(substr($cur_buffer_line, 0, 4), "<!--")) { + $inside_comment = true; + continue; + } elseif ($inside_comment) { + // Check end of comment + if (! strcmp(substr($cur_buffer_line, 0, 4), "-->")) { + // Only data comments are closed. The structure comments + // will be closed when a data comment begins (in order to + // skip structure tables) + if ($inside_data_comment) { + $inside_data_comment = false; + } + + // End comments that are not related to table structure + if (! $inside_structure_comment) { + $inside_comment = false; + } + } else { + // Check table name + $match_table_name = array(); + if (preg_match( + "/^Table data for `(.*)`$/", + $cur_buffer_line, + $match_table_name + ) + ) { + $cur_table_name = $match_table_name[1]; + $inside_data_comment = true; + + // End ignoring structure rows + if ($inside_structure_comment) { + $inside_structure_comment = false; + } + } elseif (preg_match( + "/^Table structure for `(.*)`$/", + $cur_buffer_line, + $match_table_name + ) + ) { + // The structure comments will be ignored + $inside_structure_comment = true; + } + } + continue; + } elseif (preg_match('/^\{\|(.*)$/', $cur_buffer_line, $matches)) { + // Check start of table + + // This will store all the column info on all rows from + // the current table read from the buffer + $cur_temp_table = array(); + + // Will be used as storage for the current row in the buffer + // Once all its columns are read, it will be added to + // $cur_temp_table and then it will be emptied + $cur_temp_line = array(); + + // Helps us differentiate the header columns + // from the normal columns + $in_table_header = false; + // End processing because the current line does not + // contain any column information + } elseif (substr($cur_buffer_line, 0, 2) === '|-' + || substr($cur_buffer_line, 0, 2) === '|+' + || substr($cur_buffer_line, 0, 2) === '|}' + ) { + // Check begin row or end table + + // Add current line to the values storage + if (! empty($cur_temp_line)) { + // If the current line contains header cells + // ( marked with '!' ), + // it will be marked as table header + if ( $in_table_header ) { + // Set the header columns + $cur_temp_table_headers = $cur_temp_line; + } else { + // Normal line, add it to the table + $cur_temp_table [] = $cur_temp_line; + } + } + + // Empty the temporary buffer + $cur_temp_line = array(); + + // No more processing required at the end of the table + if (substr($cur_buffer_line, 0, 2) === '|}') { + $current_table = array( + $cur_table_name, + $cur_temp_table_headers, + $cur_temp_table + ); + + // Import the current table data into the database + $this->_importDataOneTable($current_table); + + // Reset table name + $cur_table_name = ""; + } + // What's after the row tag is now only attributes + + } elseif (($first_character === '|') || ($first_character === '!')) { + // Check cell elements + + // Header cells + if ($first_character === '!') { + // Mark as table header, but treat as normal row + $cur_buffer_line = str_replace('!!', '||', $cur_buffer_line); + // Will be used to set $cur_temp_line as table header + $in_table_header = true; + } else { + $in_table_header = false; + } + + // Loop through each table cell + $cells = $this->_explodeMarkup($cur_buffer_line); + foreach ($cells as $cell) { + // A cell could contain both parameters and data + $cell_data = explode('|', $cell, 2); + + // A '|' inside an invalid link should not + // be mistaken as delimiting cell parameters + if (strpos($cell_data[0], '[[') === true ) { + if (count($cell_data) == 1) { + $cell = $cell_data[0]; + } else { + $cell = $cell_data[1]; + } + } + + // Delete the beginning of the column, if there is one + $cell = trim($cell); + $col_start_chars = array( "|", "!"); + foreach ($col_start_chars as $col_start_char) { + if (strpos($cell, $col_start_char) === 0) { + $cell = trim(substr($cell, 1)); + } + } + + // Add the cell to the row + $cur_temp_line [] = $cell; + } // foreach $cells + } else { + // If it's none of the above, then the current line has a bad + // format + $message = PMA_Message::error( + __('Invalid format of mediawiki input on line: <br />%s.') + ); + $message->addParam($cur_buffer_line); + $error = true; + } + } // End treating full buffer lines + } // while - finished parsing buffer + } + + /** + * Imports data from a single table + * + * @param array $table containing all table info: + * <code> + * $table[0] - string containing table name + * $table[1] - array[] of table headers + * $table[2] - array[][] of table content rows + * </code> + * + * @global bool $analyze whether to scan for column types + * + * @return void + */ + private function _importDataOneTable ($table) + { + $analyze = $this->_getAnalyze(); + if ($analyze) { + // Set the table name + $this->_setTableName($table[0]); + + // Set generic names for table headers if they don't exist + $this->_setTableHeaders($table[1], $table[2][0]); + + // Create the tables array to be used in PMA_buildSQL() + $tables = array(); + $tables [] = array($table[0], $table[1], $table[2]); + + // Obtain the best-fit MySQL types for each column + $analyses = array(); + $analyses [] = PMA_analyzeTable($tables[0]); + + $this->_executeImportTables($tables, $analyses); + } + + // Commit any possible data in buffers + PMA_importRunQuery(); + } + + /** + * Sets the table name + * + * @param string &$table_name reference to the name of the table + * + * @return void + */ + private function _setTableName(&$table_name) + { + if (empty($table_name)) { + $result = PMA_DBI_fetch_result('SHOW TABLES'); + // todo check if the name below already exists + $table_name = 'TABLE '.(count($result) + 1); + } + } + + /** + * Set generic names for table headers, if they don't exist + * + * @param array &$table_headers reference to the array containing the headers + * of a table + * @param array $table_row array containing the first content row + * + * @return void + */ + private function _setTableHeaders(&$table_headers, $table_row) + { + if (empty($table_headers)) { + // The first table row should contain the number of columns + // If they are not set, generic names will be given (COL 1, COL 2, etc) + $num_cols = count($table_row); + for ($i = 0; $i < $num_cols; ++ $i) { + $table_headers [$i] = 'COL '. ($i + 1); + } + } + } + + /** + * Sets the database name and additional options and calls PMA_buildSQL() + * Used in PMA_importDataAllTables() and $this->_importDataOneTable() + * + * @param array &$tables structure: + * array( + * array(table_name, array() column_names, array()() rows) + * ) + * @param array &$analyses structure: + * $analyses = array( + * array(array() column_types, array() column_sizes) + * ) + * + * @global string $db name of the database to import in + * + * @return void + */ + private function _executeImportTables(&$tables, &$analyses) + { + global $db; + + // $db_name : The currently selected database name, if applicable + // No backquotes + // $options : An associative array of options + if (strlen($db)) { + $db_name = $db; + $options = array('create_db' => false); + } else { + $db_name = 'mediawiki_DB'; + $options = null; + } + + // Array of SQL strings + // Non-applicable parameters + $create = null; + + // Create and execute necessary SQL statements from data + PMA_buildSQL($db_name, $tables, $analyses, $create, $options); + + unset($tables); + unset($analyses); + } + + + /** + * Replaces all instances of the '||' separator between delimiters + * in a given string + * + * @param string $start_delim start delimiter + * @param string $end_delim end delimiter + * @param string $replace the string to be replaced with + * @param string $subject the text to be replaced + * + * @return string with replacements + */ + private function _delimiterReplace($start_delim, $end_delim, $replace, $subject) + { + // String that will be returned + $cleaned = ""; + // Possible states of current character + $inside_tag = false; + $inside_attribute = false; + // Attributes can be declared with either " or ' + $start_attribute_character = false; + + // The full separator is "||"; + // This rembembers if the previous character was '|' + $partial_separator = false; + + // Parse text char by char + for ($i = 0; $i < strlen($subject); $i ++) { + $cur_char = $subject[$i]; + // Check for separators + if ($cur_char == '|') { + // If we're not inside a tag, then this is part of a real separator, + // so we append it to the current segment + if (! $inside_attribute) { + $cleaned .= $cur_char; + if ($partial_separator) { + $inside_tag = false; + $inside_attribute = false; + } + } elseif ($partial_separator) { + // If we are inside a tag, we replace the current char with + // the placeholder and append that to the current segment + $cleaned .= $replace; + } + + // If the previous character was also '|', then this ends a + // full separator. If not, this may be the beginning of one + $partial_separator = ! $partial_separator; + } else { + // If we're inside a tag attribute and the current character is + // not '|', but the previous one was, it means that the single '|' + // was not appended, so we append it now + if ($partial_separator && $inside_attribute) { + $cleaned .= "|"; + } + // If the char is different from "|", no separator can be formed + $partial_separator = false; + + // any other character should be appended to the current segment + $cleaned .= $cur_char; + + if ($cur_char == '<' && ! $inside_attribute) { + // start of a tag + $inside_tag = true; + } elseif ($cur_char == '>' && ! $inside_attribute) { + // end of a tag + $inside_tag = false; + } elseif (($cur_char == '"' || $cur_char == "'") && $inside_tag) { + // start or end of an attribute + if (! $inside_attribute) { + $inside_attribute = true; + // remember the attribute`s declaration character (" or ') + $start_attribute_character = $cur_char; + } else { + if ($cur_char == $start_attribute_character) { + $inside_attribute = false; + // unset attribute declaration character + $start_attribute_character = false; + } + } + } + } + } // end for each character in $subject + + return $cleaned; + } + + /** + * Separates a string into items, similarly to explode + * Uses the '||' separator (which is standard in the mediawiki format) + * and ignores any instances of it inside markup tags + * Used in parsing buffer lines containing data cells + * + * @param string $text text to be split + * + * @return array + */ + private function _explodeMarkup($text) + { + $separator = "||"; + $placeholder = "\x00"; + + // Remove placeholder instances + $text = str_replace($placeholder, '', $text); + + // Replace instances of the separator inside HTML-like + // tags with the placeholder + $cleaned = $this->_delimiterReplace("<", ">", $placeholder, $text); + // Explode, then put the replaced separators back in + $items = explode($separator, $cleaned); + foreach ($items as $i => $str) { + $items[$i] = str_replace($placeholder, $separator, $str); + } + + return $items; + } + + + /* ~~~~~~~~~~~~~~~~~~~~ Getters and Setters ~~~~~~~~~~~~~~~~~~~~ */ + + + /** + * Returns true if the table should be analyzed, false otherwise + * + * @return bool + */ + private function _getAnalyze() + { + return $this->_analyze; + } + + /** + * Sets to true if the table should be analyzed, false otherwise + * + * @param bool $analyze status + * + * @return void + */ + private function _setAnalyze($analyze) + { + $this->_analyze = $analyze; + } +}
\ No newline at end of file |
