<?xml version='1.0' encoding='UTF-8'?><?xml-stylesheet href='static/style.xsl' type='text/xsl'?><OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd"><responseDate>2026-04-10T10:32:54Z</responseDate><request verb="GetRecord" identifier="oai:clarin.eurac.edu:20.500.12124/8" metadataPrefix="oai_dc">http://clarin.eurac.edu/repository/oai/request</request><GetRecord><record><header><identifier>oai:clarin.eurac.edu:20.500.12124/8</identifier><datestamp>2023-03-17T15:51:45Z</datestamp><setSpec>hdl_20.500.12124_35</setSpec><setSpec>hdl_20.500.12124_2</setSpec></header><metadata><oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:doc="http://www.lyncode.com/xoai" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd">
<!-- OAI-PMH GetRecord response for record oai:clarin.eurac.edu:20.500.12124/8,
     requested with metadataPrefix="oai_dc". The record header lists two setSpec
     memberships; everything below is the record's Dublin Core (dc:) payload. -->
<!-- NOTE(review): xmlns:xsi is declared on both the OAI-PMH root and on oai_dc:dc
     with the same URI, and xmlns:doc is declared although no element or attribute
     in this document uses the doc: prefix. Both are harmless to a namespace-aware
     parser; left as emitted by the repository. -->
<dc:title>KrdWrd CANOLA Corpus 1.0</dc:title>
<!-- Creators are given as "Family name, Given name(s)", one element per person. -->
<dc:creator>Stemle, Egon W.</dc:creator>
<dc:creator>Steger, Johannes M.</dc:creator>
<!-- Free-text keywords, one per dc:subject element. -->
<dc:subject>boiler plate removal</dc:subject>
<dc:subject>web page cleaning</dc:subject>
<dc:subject>WaC</dc:subject>
<dc:subject>Web as Corpus</dc:subject>
<dc:subject>training data</dc:subject>
<dc:subject>manual annotation</dc:subject>
<dc:description>The CANOLA Corpus is a visually annotated English web corpus for training classification engines to remove boiler plate on unseen Web pages. It was harvested, annotated and evaluated by the tools and infrastructure of the KrdWrd Project.</dc:description>
<!-- NOTE(review): the record does not say which event this date denotes
     (creation, issue, ...); it differs from the header datestamp above. -->
<dc:date>2010-09-10</dc:date>
<dc:type>corpus</dc:type>
<!-- Handle URL whose suffix (20.500.12124/8) matches the OAI identifier in the header. -->
<dc:identifier>http://hdl.handle.net/20.500.12124/8</dc:identifier>
<!-- NOTE(review): three-letter code, presumably ISO 639-2/3 for English; confirm. -->
<dc:language>eng</dc:language>
<!-- Three related resources given as bare URIs; plain dc:relation carries no
     relation type, so how each one relates to this record is not stated here. -->
<dc:relation>https://github.com/krdwrd/data/releases/tag/v1.0</dc:relation>
<dc:relation>https://www.sigwac.org.uk/raw-attachment/wiki/WAC5/WAC5_proceedings.pdf</dc:relation>
<dc:relation>http://hdl.handle.net/20.500.12124/9</dc:relation>
<!-- Rights appear in three forms: a licence name, a licence URL, and a short code.
     NOTE(review): "PUB" is presumably a repository availability label; confirm
     against the repository's documentation before relying on it. -->
<dc:rights>Creative Commons - Attribution-ShareAlike 4.0 International (CC BY-SA 4.0)</dc:rights>
<dc:rights>https://creativecommons.org/licenses/by-sa/4.0/</dc:rights>
<dc:rights>PUB</dc:rights>
<!-- The first two dc:format values are media types; the third is a "key: value"
     string (downloadable_files_count), not a media type, so consumers that expect
     only media types in dc:format need to filter it out. -->
<dc:format>application/gzip</dc:format>
<dc:format>text/plain; charset=utf-8</dc:format>
<dc:format>downloadable_files_count: 1</dc:format>
<dc:publisher>Institute for Applied Linguistics, Eurac Research</dc:publisher>
<dc:source>https://krdwrd.github.io</dc:source>
</oai_dc:dc>
</metadata></record></GetRecord></OAI-PMH>