From: Simon Suchomel Date: Thu, 30 May 2013 11:02:07 +0000 (+0200) Subject: Ctvrtecni rane upravy X-Git-Tag: odeslano-20130601-2314~12 X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?p=pan13-paper.git;a=commitdiff_plain;h=c89f0f7c72770832556ef260515fa625cae4190a Ctvrtecni rane upravy --- diff --git a/pan13-paper/pan13-notebook.aux b/pan13-paper/pan13-notebook.aux index 65d20f4..8691625 100644 --- a/pan13-paper/pan13-notebook.aux +++ b/pan13-paper/pan13-notebook.aux @@ -12,8 +12,10 @@ \bibstyle{splncs03} \bibdata{pan13-notebook} \bibcite{ententen}{1} -\bibcite{Introduction_to_information_retrieval}{2} -\bibcite{chatnoir}{3} -\bibcite{suchomel_kas_12}{4} -\bibcite{SpiderLink}{5} -\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusions}{6}} +\bibcite{awfc}{2} +\bibcite{Introduction_to_information_retrieval}{3} +\bibcite{chatnoir}{4} +\bibcite{plagCorpus}{5} +\bibcite{suchomel_kas_12}{6} +\bibcite{SpiderLink}{7} +\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusions}{7}} diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib index 164e961..224791d 100755 --- a/pan13-paper/pan13-notebook.bib +++ b/pan13-paper/pan13-notebook.bib @@ -1,12 +1,12 @@ @INPROCEEDINGS{chatnoir, AUTHOR = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch}, BOOKTITLE = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)}, - DOI = {}, + DOI = {http://dx.doi.org/10.1145/2348283.2348429}, EDITOR = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson}, - ISBN = {}, + ISBN = {978-1-4503-1472-5}, MONTH = aug, - PAGES = {}, - PUBLISHER = {}, + PAGES = {1004}, + PUBLISHER = {ACM}, SITE = {Portland, Oregon}, TITLE = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}}, YEAR = {2012} @@ -34,7 +34,7 @@ added-at = {2012-05-30T10:50:27.000+0200}, address = {Cambridge, UK}, author = {Manning, 
Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich}, - biburl = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63}, + biburl = {\url{http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63}}, file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL}, PAGES = {118-120}, groups = {public}, @@ -57,7 +57,7 @@ } @inproceedings{spiderLink, - author = {Suchomel, V. and Pomik{a'}lek, J.}, + author = {Suchomel, V. and Pomik{\'a}lek, J.}, booktitle = {Proceedings of the seventh Web as Corpus Workshop (WAC7)}, pages = {39-43}, editor = {Adam Kilgarriff and Serge Sharoff}, @@ -71,3 +71,17 @@ booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR-06)}, year = {2006} } + +@INPROCEEDINGS{plagCorpus, + AUTHOR = {Martin Potthast and Matthias Hagen and Michael V{\"o}lske and Benno Stein}, + BOOKTITLE = {51st Annual Meeting of the Association of Computational Linguistics (ACL 13) -- (to appear) }, + DOI = {http://dx.doi.org/}, + EDITOR = {}, + MONTH = aug, + PAGES = {}, + PUBLISHER = {ACM}, + SITE = sofia, + TITLE = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web}, + URL = {}, + YEAR = {2013} +} diff --git a/pan13-paper/pan13-notebook.log b/pan13-paper/pan13-notebook.log index bb8b059..4efdadb 100644 --- a/pan13-paper/pan13-notebook.log +++ b/pan13-paper/pan13-notebook.log @@ -1,4 +1,4 @@ -This is pdfeTeXk, Version 3.141592-1.11a-2.1 (Web2C 7.5.2) (format=pdflatex 2011.8.15) 28 MAY 2013 21:56 +This is pdfeTeXk, Version 3.141592-1.11a-2.1 (Web2C 7.5.2) (format=pdflatex 2011.8.15) 30 MAY 2013 13:01 entering extended mode %&-line parsing enabled. 
**pan13-notebook.tex @@ -294,36 +294,39 @@ LaTeX Font Info: External font `cmex10' loaded for size <./img/source_retrieval_process.pdf>] LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <10> not available -(Font) Font shape `T1/ptm/b/n' tried instead on input line 49. +(Font) Font shape `T1/ptm/b/n' tried instead on input line 50. [3] -LaTeX Warning: Citation `awfc' on page 4 undefined on input line 92. +LaTeX Warning: Reference `resSelection' on page 4 undefined on input line 127. -) [4] +[4]) [5] \openout2 = `yenya-text_alignment.aux'. - (./yenya-text_alignment.tex) [5 + (./yenya-text_alignment.tex) [6 -] (./pan13-notebook.bbl) [6 +] (./pan13-notebook.bbl) [7 ] (./pan13-notebook.aux (./simon-source_retrieval.aux) -(./yenya-text_alignment.aux)) ) +(./yenya-text_alignment.aux)) + +LaTeX Warning: There were undefined references. + + ) Here is how much of TeX's memory you used: - 1873 strings out of 94668 - 22750 string characters out of 1175711 - 78682 words of memory out of 1527924 - 4992 multiletter control sequences out of 10000+50000 + 1875 strings out of 94668 + 22776 string characters out of 1175711 + 78690 words of memory out of 1527932 + 4994 multiletter control sequences out of 10000+50000 47511 words of font info for 49 fonts, out of 1000000 for 2000 458 hyphenation exceptions out of 1000 29i,9n,21p,221b,226s stack positions out of 5000i,500n,6000p,200000b,40000s - 59 PDF objects out of 300000 + 63 PDF objects out of 300000 0 named destinations out of 131072 6 words of extra memory for PDF output out of 65536 -{/export/packages/share/texlive2003/texmf/dvips/ -psnfss/8r.enc} -Output written on pan13-notebook.pdf (6 pages, 152139 bytes). +{/export/packages/share/texlive2003/texmf/dvips/psnfss/8r.enc} +Output written on pan13-notebook.pdf (7 pages, 154804 bytes). 
diff --git a/pan13-paper/pan13-notebook.pdf b/pan13-paper/pan13-notebook.pdf index 8835ca1..d7ce950 100644 Binary files a/pan13-paper/pan13-notebook.pdf and b/pan13-paper/pan13-notebook.pdf differ diff --git a/pan13-paper/simon-source_retrieval.aux b/pan13-paper/simon-source_retrieval.aux index 44df3eb..95d16bb 100644 --- a/pan13-paper/simon-source_retrieval.aux +++ b/pan13-paper/simon-source_retrieval.aux @@ -1,8 +1,9 @@ \relax -\citation{chatnoir} +\citation{plagCorpus} \@writefile{toc}{\contentsline {section}{\numberline {2}Source Retrieval}{2}} \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Source retrieval process.}}{2}} \newlabel{fig:source_retr_process}{{1}{2}} +\citation{chatnoir} \citation{suchomel_kas_12} \citation{Introduction_to_information_retrieval} \citation{ententen} @@ -13,11 +14,12 @@ \citation{suchomel_kas_12} \@writefile{toc}{\contentsline {subsubsection}{Intrinsic Plagiarism Based Queries.}{4}} \@writefile{toc}{\contentsline {subsubsection}{Paragraph Based Queries.}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{4}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{5}} +\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Source Retrieval Results}{5}} \@setckpt{simon-source_retrieval}{ -\setcounter{page}{5} +\setcounter{page}{6} \setcounter{equation}{0} \setcounter{enumi}{0} \setcounter{enumii}{0} @@ -27,7 +29,7 @@ \setcounter{mpfootnote}{0} \setcounter{part}{0} \setcounter{section}{2} -\setcounter{subsection}{4} +\setcounter{subsection}{5} \setcounter{subsubsection}{0} \setcounter{paragraph}{0} \setcounter{subparagraph}{0} 
diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex index c27a16e..2cb1a8f 100755 --- a/pan13-paper/simon-source_retrieval.tex +++ b/pan13-paper/simon-source_retrieval.tex @@ -5,9 +5,9 @@ large corpus. Those candidate documents are usually further compared in detail w suspicious document. In the PAN 2013 source retrieval subtask the main goal was to identified web pages which have been used as a source of plagiarism for creation of the test corpus. -The test corpus contained XX documents each discussing one and only one theme. +The test corpus contained 58 documents each discussing one and only one theme. Those documents were created intentionally by - semiprofessional writers, thus they feature nearly realistic plagiarism cases. + semiprofessional writers, thus they feature nearly realistic plagiarism cases~\cite{plagCorpus}. Such conditions are similar to a realistic plagiarism detection scenario, such as for state of the art commercial plagiarism detection systems or the anti-plagiarism service developed on and utilized at the Masaryk University. The main difference between real-world corpus @@ -33,7 +33,8 @@ In real-world scenario the corpus is the whole Web and the search engine can be which scales to the size of the Web. This methodology is based on the fact that we do not possess enough resources to download and effectively process the whole corpus. In the case of PAN 2013 competition the corpus -of source documents is the ClueWeb\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus. +of source documents is the ClueWeb\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus. + As a document retrieval tool for the competition we utilized the ChatNoir~\cite{chatnoir} search engine which indexes the English subset of the ClueWeb. 
The reverse engineering decision process reside in creation of suitable queries on the basis of the suspicious document @@ -103,17 +104,47 @@ The intrinsic plagiarism based queries are positional. They carry the position o They are phrasal, since they represent a search for a specific sentence. And they are nondeterministic, because the representative sentence is selected randomly. +\subsubsection{Paragraph Based Queries.} +The purpose of paragraph based queries is to check some parts of the text in more depth, +namely the parts for which no similarity has been found during previous searches. +For this case we considered a paragraph as a minimum text chunk for plagiarism to occur. +It is debatable whether a plagiarist would be prosecuted for plagiarizing only one sentence in a paragraph. +Also, detecting a specific sentence is very difficult if we want to avoid an exhaustive search approach. +If someone is to reuse some piece of continuous text, it would probably be no shorter than a paragraph. +Despite the fact that paragraphs differ in length, we represent one paragraph by one query. -\subsubsection{Paragraph Based Queries.} -they were executed as the last asi az v search control -it would be extremely difficult to detect a single sentence other way than by exhaustive searching methods + +The paragraph based query was created from each paragraph of a suspicious document. +From each paragraph we extracted the longest sentence from which the query was constructed. +Ideally the extracted sentence should carry the highest information gain. +The query was at most 10 words long, which is the upper bound of ChatNoir, +and was constructed from the selected sentence by omitting stop words. \subsection{Search Control} -neoptimalizujeme na spravne utvorene dotazy z klicovych slov - stoji to vice dotazu +For each suspicious document we prepared all three types of queries during the first phase at once. +Queries were executed stepwise. 
+After processing each query the results were evaluated (see the following subsection~\ref{resSelection} for more details) and from +all textual similarities between each result and the suspicious document, the suspicious document intervals of those similarities +were marked as `discovered'. +At first, the keywords based queries were processed. All of the keywords based queries were +always executed. +After having all the keywords based queries processed, the intrinsic plagiarism based queries were executed according to +their creation sequence. +Since they carry their position, not all of the intrinsic plagiarism based queries were carried out. +During the execution, if the query position intersected with any of the `discovered' intervals, the +query was dropped. In the same way, the paragraph based queries were processed last. + +This search control results in two major properties. Firstly, the source retrieval effort was increased +in parts of the suspicious document where no textual similarity has yet been found, +especially by the paragraph based queries. Secondly, after detecting a similarity for a certain part of the text, +no more intentional retrieval attempts for that part were made. This means that all +discovered search engine results were evaluated, but no more queries regarding that passage were executed. 
\subsection{Result Selection} +\label{resSelection} \subsection{Snippet Control} +\subsection{Source Retrieval Results} diff --git a/pan13-paper/yenya-text_alignment.aux b/pan13-paper/yenya-text_alignment.aux index d67cb0a..e01f6f1 100644 --- a/pan13-paper/yenya-text_alignment.aux +++ b/pan13-paper/yenya-text_alignment.aux @@ -1,8 +1,8 @@ \relax -\@writefile{toc}{\contentsline {section}{\numberline {3}Text Alignment}{5}} -\newlabel{text_alignment}{{3}{5}} +\@writefile{toc}{\contentsline {section}{\numberline {3}Text Alignment}{6}} +\newlabel{text_alignment}{{3}{6}} \@setckpt{yenya-text_alignment}{ -\setcounter{page}{6} +\setcounter{page}{7} \setcounter{equation}{0} \setcounter{enumi}{0} \setcounter{enumii}{0}