Edit source Talk
Redirected from Appropedia:Porting HTML to wiki

Appropedia:Porting formatted content to MediaWiki

From Appropedia

Suggestion: Merge with the manual on Mediawiki.org

Formatted text, or "rich text," including HTML, Word or OpenOffice documents, can be converted to MediaWiki markup with its formatting preserved.

There are a few techniques which are now being trialed. It may be that wikEd is better for HTML files and OpenOffice is better for Word/rtf files (and Word/rtf files converted from PDF).[verification needed]

If you are starting with a PDF document, it must first be converted to formatted text in another application, before it can be converted to MediaWiki: see Help:Porting PDF files to MediaWiki.

Pandoc[edit source]

Pandoc is a markup converter that accepts many different formats, including MediaWiki. Check this example of how to convert from Markdown. This is probably the easiest method to get any type of content into MediaWiki markup and have it published on Appropedia.

Using R scripts[edit source]

Google Spreadsheet into wikitable[edit source]

require(tidyverse)
require(googlesheets)
require(lubridate)

# Download the first worksheet of the Google Sheet as a local CSV file.
# Paste the URL of your spreadsheet between the quotes before running.
# NOTE(review): the googlesheets package has been superseded by googlesheets4;
# confirm that gs_url()/gs_download() still work in your environment.
working_sheet <- gs_url("")
gs_download(
  from = working_sheet,
  ws = 1,
  to = "appropedia_mentions.csv",
  overwrite = TRUE,
  encoding = "UTF-8",
  verbose = TRUE
)

# Load the downloaded CSV as a tibble.
mentions <- as_tibble(read.csv("appropedia_mentions.csv", encoding = "UTF-8"))

# Parse the month/day/year strings. as.Date() is used instead of strptime()
# because strptime() returns POSIXlt, a list-based class that is not suited to
# being stored as a data frame column; the table only needs the calendar date.
mentions$Date <- as.Date(mentions$Date, format = "%m/%d/%Y")

# Opening lines of the wikitable (including the column headings) and its
# closing line.
header <- "{| class=\"wikitable\"\n ! Topic !! Date !! Author !! Language\n|-"
footer <- "|}"
writer <- function(t){
  # Build the wikitable markup for every record in `t`: a linked topic cell,
  # then date, author and language cells, closed by a row separator.
  # Returns a character vector with one element per record.
  link_cell <- paste0("| [", t$URL, " ", t$Topic, "]\n")
  date_cell <- paste0("| ", t$Date, "\n")
  author_cell <- paste0("| ", t$Autor, "\n")
  language_cell <- paste0("| ", t$idioma, "\n|-\n")
  paste0(link_cell, date_cell, author_cell, language_cell)
}

# Write the complete table in one call. write() puts each element of a
# character vector on its own line, so the file content is the same as with
# three separate calls. The file is overwritten instead of appended to, so
# re-running the script does not stack another copy of the table onto the
# previous one.
write(c(header, writer(mentions), footer), file = "mentions.txt")

Google Scholar searches to wikitable[edit source]

This generates a wikitable based on Google Scholar, for example: Appropedia:List_of_academic_citations.

require(rvest, quietly=T)
require(dplyr, quietly=T)

# Result pages to fetch (1-based page numbers). The script is run in batches
# of a few pages to avoid HTTP error 429 (too many requests).
index <- seq(from = 81, to = 90)
results_per_page <- 10

# Gather the top-level children of every fetched page under one XML root, see
# https://stackoverflow.com/questions/43461907/in-r-how-do-i-combine-two-xml-documents-into-one-document
# The xml2 functions are namespace-qualified because this script only attaches
# rvest and dplyr.
xml0 <- xml2::read_xml("<html></html>")
for (i in index) {
  # `start` is the number of results to skip, so page i begins at
  # 10 * (i - 1). With pages starting at 81 this gives 800, matching the
  # offset used when the results are numbered for printing.
  url <- paste0(
    "https://scholar.google.com/scholar?start=",
    results_per_page * (i - 1),
    "&q=appropedia"
  )
  result <- read_html(url)

  for (child in xml2::xml_children(result)) {
    xml2::xml_add_child(xml0, child)
  }

  # Pause between requests to reduce the chance of being rate-limited.
  Sys.sleep(10)
}

# Select the nodes carrying all three result classes (presumably one node per
# search result -- verify against the current Google Scholar markup).
papers <- xml0 %>% html_nodes(".gs_r.gs_or.gs_scl")

# Return the first element of `values`, or NA when there is none, so that
# every field contributes exactly one named entry per paper. Without this,
# c() silently drops fields with no match and renames fields with several
# matches (url1, url2, ...), which changes the columns of the final table.
first_or_na <- function(values) {
  if (length(values) == 0) NA_character_ else values[[1]]
}

# One named character vector per paper. seq_along() is used instead of
# 1:length(papers) so that nothing is iterated when no results were found.
articles_list <- lapply(seq_along(papers), function(p) {
  paper <- papers[p]
  heading <- paper %>% html_nodes("h3.gs_rt")
  c(
    title = heading %>% html_text() %>% first_or_na(),
    # Concatenated id attributes of the <span>/<a> children of the heading.
    id = heading %>%
      html_nodes(xpath = "./span | ./a") %>%
      html_attr("id") %>%
      paste0(collapse = ""),
    author = paper %>% html_nodes(".gs_a") %>% html_text() %>% first_or_na(),
    url = paper %>%
      html_nodes(".gs_or_ggsm") %>%
      html_nodes("a") %>%
      html_attr("href") %>%
      first_or_na(),
    abstract = paper %>% html_nodes(".gs_rs") %>% html_text() %>% first_or_na()
  )
})

# Stack the per-paper vectors into one tibble, one row per paper.
articles_df <- do.call(bind_rows, articles_list) %>% as_tibble()

# Derive the printable columns from the scraped text. The stringr functions
# are namespace-qualified because this script does not attach stringr.
articles_df <- articles_df %>%
  mutate(
    # Extracting year: the first run of four digits in the byline.
    year = stringr::str_extract(author, "\\d{4}"),
    # List of authors, with surrounding whitespace trimmed. The pattern takes
    # the text that precedes the first "- " in the byline, falling back to a
    # leading run of words.
    # Earlier pattern, kept for reference:
    # "[\\w ]+[, ?][\\w+\\s][\\w+,\\s]*(?= -)"
    authors = stringr::str_trim(
      stringr::str_extract(author, "\\w+[, ][^\\d+][^-]+(?=- )|(\\w ?)+")
    ),
    # Clean the title: keep the text that follows any leading bracketed tag,
    # or the whole title when it does not start with "[".
    clean_title = stringr::str_extract(title, "(?<=\\w\\] )[^\\[].*|^[^\\[].*")
  )

# Google Scholar get citation
# https://scholar.google.com.sv/scholar?q=info:43lfyFl0WdUJ:scholar.google.com/&output=cite&scirp=10&hl=en

# Number the rows so that the numbering continues across batches: with 10
# results per page, 10 * (first page - 1) results come before this batch
# (800 when `index` starts at 81). Deriving the offset from `index` removes
# the need to edit it by hand for every batch, and seq_len() yields an empty
# sequence when there are no rows, where 1:nrow() would yield c(1, 0).
first_result <- 10 * (min(index) - 1)
articles_to_print <- bind_cols(
  n = first_result + seq_len(nrow(articles_df)),
  articles_df
)
docmaker <- function(t){
  # Render every record in `t` as a short text entry (a numbered title line
  # followed by authors, year and url lines) and append all entries to
  # result_appropedia_2.txt.
  heading_line <- paste0("# ", t$n, ". ", t$clean_title, "\n")
  authors_line <- paste0("- authors: ", t$authors, "\n")
  year_line <- paste0("- year: ", t$year,  "\n")
  url_line <- paste0("- url: ", t$url,  "\n")
  entries <- paste0(heading_line, authors_line, year_line, url_line)
  write(entries, file="result_appropedia_2.txt", append=TRUE)
}

# Append the entries for this batch to result_appropedia_2.txt.
docmaker(articles_to_print)