From ab24ba815bdf84d55df641c0d03d4dc93ad0453e Mon Sep 17 00:00:00 2001
From: Simon Suchomel
Date: Thu, 30 May 2013 23:00:51 +0200
Subject: [PATCH] Thursday edits

---
 pan13-paper/pan13-notebook.bib         |  5 +--
 pan13-paper/simon-source_retrieval.tex | 51 ++++++++++++++++++++++----
 2 files changed, 45 insertions(+), 11 deletions(-)

diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib
index 59c4aa0..204d2fc 100755
--- a/pan13-paper/pan13-notebook.bib
+++ b/pan13-paper/pan13-notebook.bib
@@ -12,7 +12,6 @@
 YEAR = {2012}
 }
 
-<<<<<<< HEAD
 @inproceedings{suchomel_kas_12,
   added-at = {2012-10-01T11:37:58.000+0200},
   author = {Suchomel, {\v S}imon and Kasprzak, Jan and Brandejs, Michal},
@@ -85,7 +84,8 @@
 TITLE = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web},
 URL = {},
 YEAR = {2013}
-=======
+}
+
 @INPROCEEDINGS{Kasprzak2009a,
 AUTHOR = "Jan Kasprzak and Michal Brandejs and Miroslav K\v{r}ipa\v{c}",
 TITLE = "Finding Plagiarism by Evaluating Document Similarities",
@@ -124,5 +124,4 @@
 booktitle={CLEF (Online Working Notes/Labs/Workshop)},
 pages={1--8},
 year={2012}
->>>>>>> 2278ad058d0a6e0c2228741c76aece9ace432912
 }
diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex
index d5b338b..29e9781 100755
--- a/pan13-paper/simon-source_retrieval.tex
+++ b/pan13-paper/simon-source_retrieval.tex
@@ -73,7 +73,7 @@ The method combines term frequency analysis with TF-IDF score~\cite{Introduction
 corpus we used the English web corpus~\cite{ententen}, crawled by SpiderLink~\cite{SpiderLink}
 in 2012, which contains 4.65 billion tokens. Each keyword-based query was constructed from the
 five top-ranked keywords consecutively. Each keyword was
-used only in one query. Too long keywords based queries would be over-specific and it would have resulted
+used in only one query. Queries built from too many keywords would be overly specific and would have resulted
 in a low recall. On the other hand, queries that are too short (one or two tokens) would have
 resulted in a low precision, and possibly also a low recall, since they would be too general.
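The query formulation in this hunk amounts to chunking the ranked keyword list into disjoint groups of five. A minimal sketch in Python, assuming the keywords are already ranked best-first by the TF-IDF-based score; the names are illustrative, not the authors' published code:

    # Sketch of keyword-based query formulation: consecutive groups of
    # five top-ranked keywords, each keyword used in exactly one query.
    def build_keyword_queries(ranked_keywords, per_query=5):
        queries = []
        for i in range(0, len(ranked_keywords), per_query):
            queries.append(" ".join(ranked_keywords[i:i + per_query]))
        return queries

    # Example: ten ranked keywords yield two five-keyword queries.
    print(build_keyword_queries(
        ["plagiarism", "detection", "source", "retrieval", "corpus",
         "snippet", "alignment", "query", "recall", "precision"]))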
@@ -143,7 +143,7 @@ no more intentionally retrieval attempts for that part were effectuated. Meaning
 discovered search engine results were evaluated, but no further queries regarding
 that passage were executed.
 
-\subsection{Result Selection}
+\subsection{Result Selection}\label{resSelection}
 The second main decision in the source retrieval task is which of the search engine results
 to download. This process is represented in figure~\ref{fig:source_retr_process} as 'Selecting'.
 Nowadays, in the real world, downloading is a very cheap and quick operation. There can be some disk space considerations
@@ -157,11 +157,6 @@ operation. The snippet purpose is to have a quick glance at a small extract of r
 The extract is at most 500 characters long and it is a portion of the document around the
 given keywords. On the basis of the snippet, we needed to decide whether to actually
 download the result or not.
-Since the snippet is relatively small and it can be discontinuous part of the text, the
-text alignment methods described in section~\ref{text_alignment} were insufficient for
-
-
-
 \subsection{Snippet Control}
 \begin{figure}
 \centering
@@ -169,6 +164,46 @@ text alignment methods described in section~\ref{text_alignment} were insufficie
 \caption{Downloads and similarities performance.}
 \label{fig:snippet_graph}
 \end{figure}
-\subsection{Source Retrieval Results}
+Since the snippet is relatively small and can be a discontinuous part of the text, the
+text alignment methods described in section~\ref{text_alignment} were insufficient
+for deciding whether to download a document. Therefore we chose to check the existence
+of snippet word tuples in the suspicious document. For 1-tuples the measure expresses how
+many words from the snippet also occur in the suspicious document. If the snippet contains
+many common words, they may occur in many documents, so the 1-tuple measure has little
+decisive value.
+
+We therefore used a 2-tuple measure, which indicates how many neighbouring word pairs
+coexist in the snippet and in the suspicious document. According to this value we decided
+whether to download the source or not. To derive the threshold value we used 4413 search
+results from various queries over the documents in the training corpus. Each resulting
+document was textually aligned to its corresponding suspicious document. One similarity
+represents a continuous passage of text alignment similarity, as described in the following
+section~\ref{text_alignment}. In this way we obtained 248 similarities in total after
+downloading all of the 4413 documents.
+
+The performance of the 2-tuple similarity is depicted in figure~\ref{fig:snippet_graph}.
+The horizontal axis represents the threshold of the 2-tuple similarity percentage between
+the snippet and the suspicious document. The curves show the percentage of downloads
+executed and the percentage of similarities discovered as functions of this threshold.
+A profitable threshold is one with the largest distance between the two curves.
+We set the snippet similarity threshold to 20\%, which in the graph corresponds to 20\% of
+all downloads while still discovering 70\% of the similarities.
+
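The 2-tuple check and the 20% download threshold can be made concrete with a minimal Python sketch; whitespace tokenization and all names here are assumptions, not the authors' code:

    # Sketch of the snippet 2-tuple (neighbouring word pair) similarity
    # and the resulting download decision.
    def bigrams(tokens):
        # Set of neighbouring word pairs in a token sequence.
        return set(zip(tokens, tokens[1:]))

    def snippet_similarity(snippet, suspicious_text):
        # Fraction of the snippet's word pairs that also occur in the
        # suspicious document (0.0 if the snippet has fewer than two words).
        snippet_pairs = bigrams(snippet.lower().split())
        if not snippet_pairs:
            return 0.0
        doc_pairs = bigrams(suspicious_text.lower().split())
        return len(snippet_pairs & doc_pairs) / len(snippet_pairs)

    def should_download(snippet, suspicious_text, threshold=0.20):
        # Download when at least 20% of the snippet's word pairs reappear
        # in the suspicious document (the threshold chosen from the graph).
        return snippet_similarity(snippet, suspicious_text) >= threshold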
+\subsection{Source Retrieval Results}
+In the PAN 2013 Source Retrieval subtask we competed with 8 other teams.
+No single best approach can be selected, because there were several independent
+performance measures. Each approach has its pros and cons, and different approaches
+are usable in different situations.
+
+We believe that in realistic plagiarism detection the most important objectives are
+keeping the number of queries low and simultaneously maximizing recall.
+% There is often a tradeoff between cost and effectiveness.
+It is also advisable to keep the number of downloads down; on the other hand, downloading
+is a relatively cheap and easily scalable operation.
+
+Our approach had the second best ratio of recall to the number of queries used, which
+indicates its query efficiency. The approach with the best ratio used few queries
+(4.9 queries per document, which was 0.4 of the amount we used), but also obtained the
+lowest recall (0.65 of our recall). The approach with the highest recall (and also the
+lowest precision) achieved 2.8 times higher recall than ours with 3.9 times more queries.
+
+Our approach also achieved a low precision, which means many of the results we reported
+were not considered correct hits. On the other hand, each reported result contained some
+textual similarity according to the text alignment subtask score, which we believe is
+still worthwhile to report.
-- 
2.43.0