From: Simon Suchomel Date: Thu, 30 May 2013 15:55:04 +0000 (+0200) Subject: Merge http://www.fi.muni.cz/~kas/git/pan13-paper X-Git-Tag: odeslano-20130601-2314~10 X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=commitdiff_plain;h=eafe3e22e26382588563ac39d6f88dd022c740da;hp=d099b098d7b507d64ad54ddf73f9fbd489a0e95f;p=pan13-paper.git Merge http://www.fi.muni.cz/~kas/git/pan13-paper Conflicts: pan13-paper/pan13-notebook.bib --- diff --git a/pan13-paper/img/snippets_graph.pdf b/pan13-paper/img/snippets_graph.pdf new file mode 100755 index 0000000..7441e98 Binary files /dev/null and b/pan13-paper/img/snippets_graph.pdf differ diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib index 224791d..59c4aa0 100755 --- a/pan13-paper/pan13-notebook.bib +++ b/pan13-paper/pan13-notebook.bib @@ -12,6 +12,7 @@ YEAR = {2012} } +<<<<<<< HEAD @inproceedings{suchomel_kas_12, added-at = {2012-10-01T11:37:58.000+0200}, author = {Suchomel, {\v S}imon and Kasprzak, Jan and Brandejs, Michal}, @@ -84,4 +85,44 @@ TITLE = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web}, URL = {}, YEAR = {2013} +======= +@INPROCEEDINGS{Kasprzak2009a, + AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}", + TITLE = "Finding Plagiarism by Evaluating Document Similarities", + BOOKTITLE = "SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing", + YEAR = "2009", + file = F +} + +@inproceedings{Kasprzak2010, + title={Improving the reliability of the plagiarism detection system}, + author={Kasprzak, J. 
and Brandejs, M.}, + booktitle={Notebook Papers of CLEF 2010 LABs and Workshops}, + year={2010}, + organization={Citeseer} +} + +@article{stamatatos2011plagiarism, + title={Plagiarism detection using stopword n-grams}, + author={Stamatatos, E.}, + journal={Journal of the American Society for Information Science and Technology}, + year={2011}, + publisher={Wiley Online Library} +} + +@inproceedings{suchomel2012, + title={Three way search engine queries with multi-feature document comparison for plagiarism detection}, + author={Suchomel, {\v{S}}imon and Kasprzak, Jan and Brandejs, Michal}, + booktitle={CLEF (Online Working Notes/Labs/Workshop)}, + pages={0--8}, + year={2012} +} + +@inproceedings{torrejondetailed, + title={Detailed Comparison Module In CoReMo 1.9 Plagiarism Detector}, + author={Torrej{\'o}n, Diego A Rodr{\'\i}guez and Ramos, Jos{\'e} Manuel Mart{\'\i}n}, + booktitle={CLEF (Online Working Notes/Labs/Workshop)}, + pages={1--8}, + year={2012} +>>>>>>> 2278ad058d0a6e0c2228741c76aece9ace432912 } diff --git a/pan13-paper/yenya-text_alignment.tex b/pan13-paper/yenya-text_alignment.tex index 7a93e50..2f4a2d3 100755 --- a/pan13-paper/yenya-text_alignment.tex +++ b/pan13-paper/yenya-text_alignment.tex @@ -1 +1,51 @@ \section{Text Alignment}~\label{text_alignment} + +\subsection{Overview} + +Our approach to the text alignment subtask of PAN 2013 uses the same +basic principles as our previous work in this area, described +in \cite{suchomel2012}, which in turn builds on our work for previous +PAN campaigns, \cite{Kasprzak2010}, \cite{Kasprzak2009a}: + +We detect {\it common features} between source and suspicious documents, +where the features we currently use are word $n$-grams and stop-word $m$-grams +\cite{stamatatos2011plagiarism}. 
From those common features (each of which +can occur multiple times in both the source and the suspicious document), we form +{\it valid intervals}\footnote{% +We describe the algorithm for computing valid intervals in \cite{Kasprzak2009a}, +and a similar approach is also used in \cite{stamatatos2011plagiarism}.} +of characters +from the source and suspicious documents, where the interval in both +of these documents is covered ``densely enough'' by the common features. + +We then postprocess the valid intervals, removing overlapping detections, +and merging detections which are close enough to each other. + +In the next sections, we summarize the modifications we made for PAN 2013, +including approaches tried but not used. For the training corpus, +our software from PAN 2012 gave a plagdet score of TODO, which we +consider the baseline for further improvements. + +\subsection{Alternative features} + +TODO \cite{torrejondetailed} + +\subsection{Global postprocessing} + +For PAN 2013, the algorithm had access to all of the source and suspicious +documents. Because of this, we have rewritten our software to handle +all of the documents at once, in order to be able to do cross-document +optimizations and postprocessing, similar to what we did for PAN 2010. +This required refactoring most of the code. We are able to handle +most of the computation in parallel in per-CPU threads, with little +synchronization needed. The parallelization was used especially +for development, where it provided a significant performance boost. +The official performance numbers are from a single-threaded run, though. + +For PAN 2010, we used the following postprocessing heuristics: +If there are overlapping detections inside a suspicious document, +keep the longer one, provided that it is long enough. For overlapping +detections up to 600 characters, +TODO + +