From: Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
Date: Thu, 30 May 2013 15:55:04 +0000 (+0200)
Subject: Merge http://www.fi.muni.cz/~kas/git/pan13-paper
X-Git-Tag: odeslano-20130601-2314~10
X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?a=commitdiff_plain;h=eafe3e22e26382588563ac39d6f88dd022c740da;hp=d099b098d7b507d64ad54ddf73f9fbd489a0e95f;p=pan13-paper.git

Merge http://www.fi.muni.cz/~kas/git/pan13-paper

Conflicts:
	pan13-paper/pan13-notebook.bib
---

diff --git a/pan13-paper/img/snippets_graph.pdf b/pan13-paper/img/snippets_graph.pdf
new file mode 100755
index 0000000..7441e98
Binary files /dev/null and b/pan13-paper/img/snippets_graph.pdf differ
diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib
index 224791d..59c4aa0 100755
--- a/pan13-paper/pan13-notebook.bib
+++ b/pan13-paper/pan13-notebook.bib
@@ -12,6 +12,7 @@
         YEAR               = {2012}
 }
 
+
 @inproceedings{suchomel_kas_12,
   added-at = {2012-10-01T11:37:58.000+0200},
   author = {Suchomel, {\v S}imon and Kasprzak, Jan and Brandejs, Michal},
@@ -84,4 +85,44 @@
         TITLE              = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web},
         URL                = {},
         YEAR               = {2013}
+}
+@INPROCEEDINGS{Kasprzak2009a,
+  AUTHOR =       "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}",
+  TITLE =        "Finding Plagiarism by Evaluating Document Similarities",
+  BOOKTITLE =    "SEPLN'09: The 25th edition of the Annual Conference of the Spanish Society for Natural Language Processing",
+  YEAR =         "2009",
+  file = F
+}
+
+@inproceedings{Kasprzak2010,
+  title={Improving the reliability of the plagiarism detection system},
+  author={Kasprzak, J. and Brandejs, M.},
+  booktitle={Notebook Papers of CLEF 2010 LABs and Workshops},
+  year={2010},
+  organization={Citeseer}
+}
+
+@article{stamatatos2011plagiarism,
+  title={Plagiarism detection using stopword n-grams},
+  author={Stamatatos, E.},
+  journal={Journal of the American Society for Information Science and Technology},
+  year={2011},
+  publisher={Wiley Online Library}
+}
+
+@inproceedings{suchomel2012,
+  title={Three way search engine queries with multi-feature document comparison for plagiarism detection},
+  author={Suchomel, {\v{S}}imon and Kasprzak, Jan and Brandejs, Michal},
+  booktitle={CLEF (Online Working Notes/Labs/Workshop)},
+  pages={0--8},
+  year={2012}
+}
+
+@inproceedings{torrejondetailed,
+  title={Detailed Comparison Module In CoReMo 1.9 Plagiarism Detector},
+  author={Torrej{\'o}n, Diego A Rodr{\'\i}guez and Ramos, Jos{\'e} Manuel Mart{\'\i}n},
+  booktitle={CLEF (Online Working Notes/Labs/Workshop)},
+  pages={1--8},
+  year={2012}
+
 }
diff --git a/pan13-paper/yenya-text_alignment.tex b/pan13-paper/yenya-text_alignment.tex
index 7a93e50..2f4a2d3 100755
--- a/pan13-paper/yenya-text_alignment.tex
+++ b/pan13-paper/yenya-text_alignment.tex
@@ -1 +1,51 @@
 \section{Text Alignment}~\label{text_alignment}
+
+\subsection{Overview}
+
+Our approach to the text alignment subtask of PAN 2013 uses the same
+basic principles as our previous work in this area, described
+in \cite{suchomel2012}, which in turn builds on our work for previous
+PAN campaigns \cite{Kasprzak2010,Kasprzak2009a}:
+
+We detect {\it common features} between source and suspicious documents,
+where the features we currently use are word $n$-grams and stop-word $m$-grams
+\cite{stamatatos2011plagiarism}. From those common features (each of which
+can occur multiple times in both the source and the suspicious document), we form
+{\it valid intervals}\footnote{%
+We describe the algorithm for computing valid intervals in \cite{Kasprzak2009a},
+and a similar approach is also used in \cite{stamatatos2011plagiarism}.}
+of characters
+from the source and suspicious documents, where the interval in both
+of these documents is covered ``densely enough'' by the common features.
+
+We then postprocess the valid intervals, removing overlapping detections,
+and merging detections which are close enough to each other.
+
+In the next sections, we summarize the modifications we made for PAN 2013,
+including approaches tried but not used. For the training corpus,
+our software from PAN 2012 gave the plagdet score of TODO, which we
+consider the baseline for further improvements.
+
+\subsection{Alternative features}
+
+TODO \cite{torrejondetailed}
+
+\subsection{Global postprocessing}
+
+For PAN 2013, the algorithm had access to all of the source and suspicious
+documents. Because of this, we have rewritten our software to handle
+all of the documents at once, in order to be able to do cross-document
+optimizations and postprocessing, similar to what we did for PAN 2010.
+This required refactoring most of the code. We are able to handle
+most of the computation in parallel in per-CPU threads, with little
+synchronization needed. The parallelization was used especially
+for development, where it has provided a significant performance boost.
+The official performance numbers are from a single-threaded run, though.
+
+For PAN 2010, we used the following postprocessing heuristics:
+If there are overlapping detections inside a suspicious document,
+keep the longer one, provided that it is long enough. For overlapping
+detections up to 600 characters, 
+TODO
+
+