Ctvrtecni rane upravy

author Simon Suchomel <xsuchom1@anxur.fi.muni.cz>

Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)

committer Simon Suchomel <xsuchom1@anxur.fi.muni.cz>

Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)
author Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)
committer Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)
diff --git a/pan13-paper/pan13-notebook.aux b/pan13-paper/pan13-notebook.aux

index 65d20f402de9fce8e8fdb072de1d593c46827fb5..8691625c3e81a59a70ac898f1f542cc6d2799b3c 100644 (file)
--- a/pan13-paper/pan13-notebook.aux
+++ b/pan13-paper/pan13-notebook.aux
@@ -12,8 +12,10 @@
  \bibstyle{splncs03}
  \bibdata{pan13-notebook}
  \bibcite{ententen}{1}
-\bibcite{Introduction_to_information_retrieval}{2}
-\bibcite{chatnoir}{3}
-\bibcite{suchomel_kas_12}{4}
-\bibcite{SpiderLink}{5}
-\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusions}{6}}
+\bibcite{awfc}{2}
+\bibcite{Introduction_to_information_retrieval}{3}
+\bibcite{chatnoir}{4}
+\bibcite{plagCorpus}{5}
+\bibcite{suchomel_kas_12}{6}
+\bibcite{SpiderLink}{7}
+\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusions}{7}}
diff --git a/pan13-paper/pan13-notebook.bib b/pan13-paper/pan13-notebook.bib

index 164e9618d0c19cb2315f80b6311048e3845293a0..224791d8aa01f498984a22b9e3cba0e853dc0f26 100755 (executable)
--- a/pan13-paper/pan13-notebook.bib
+++ b/pan13-paper/pan13-notebook.bib
@@ -1,12 +1,12 @@
  @INPROCEEDINGS{chatnoir,\r
          AUTHOR             = {Martin Potthast and Matthias Hagen and Benno Stein and Jan Gra{\ss}egger and Maximilian Michel and Martin Tippmann and Clement Welsch},\r
          BOOKTITLE          = {35th International ACM Conference on Research and Development in Information Retrieval (SIGIR 12)},\r
-        DOI                = {},\r
+        DOI                = {http://dx.doi.org/10.1145/2348283.2348429},\r
          EDITOR             = {Bill Hersh and Jamie Callan and Yoelle Maarek and Mark Sanderson},\r
-        ISBN               = {},\r
+        ISBN               = {978-1-4503-1472-5},\r
          MONTH              = aug,\r
-        PAGES              = {},\r
-        PUBLISHER          = {},\r
+        PAGES              = {1004},\r
+        PUBLISHER          = {ACM},\r
          SITE               = {Portland, Oregon},\r
          TITLE              = {{ChatNoir: A Search Engine for the ClueWeb09 Corpus}},\r
          YEAR               = {2012}\r
@@ -34,7 +34,7 @@
    added-at = {2012-05-30T10:50:27.000+0200},\r
    address = {Cambridge, UK},\r
    author = {Manning, Christopher D. and Raghavan, Prabhakar and Sch{\"u}tze, Hinrich},\r
-  biburl = {http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63},\r
+  biburl = {\url{http://www.bibsonomy.org/bibtex/28516d94c1f7aa1e391ddd3ace4caa23b/flint63}},\r
    file = {Cambridge University Press Product Page:http\://www.cambridge.org/9780521865715:URL;Amazon Search inside:http\://www.amazon.de/gp/reader/0521865719/:URL;Google Books:http\://books.google.de/books?isbn=978-0-521-86571-5:URL},\r
    PAGES              = {118-120},\r
    groups = {public},\r
@@ -57,7 +57,7 @@
  }\r
  \r
  @inproceedings{spiderLink,\r
-  author = {Suchomel, V. and Pomik{a'}lek, J.},\r
+  author = {Suchomel, V. and Pomik{\'a}lek, J.},\r
    booktitle = {Proceedings of the seventh Web as Corpus Workshop (WAC7)},\r
    pages  = {39-43},\r
    editor = {Adam Kilgarriff and Serge Sharoff},\r
@@ -71,3 +71,17 @@
      booktitle = {Proceedings of the European Conference on Information Retrieval (ECIR-06)},\r
      year = {2006}\r
  }\r
+\r
+@INPROCEEDINGS{plagCorpus,\r
+        AUTHOR             = {Martin Potthast and Matthias Hagen and Michael V{\"o}lske and Benno Stein},\r
+        BOOKTITLE          = {51st Annual Meeting of the Association for Computational Linguistics (ACL 13) -- (to appear) },\r
+        DOI                = {http://dx.doi.org/},\r
+        EDITOR             = {},\r
+        MONTH              = aug,\r
+        PAGES              = {},\r
+        PUBLISHER          = {Association for Computational Linguistics},\r
+        SITE               = {Sofia, Bulgaria},\r
+        TITLE              = {Crowdsourcing Interaction Logs to Understand Text Reuse from the Web},\r
+        URL                = {},\r
+        YEAR               = {2013}\r
+}\r
diff --git a/pan13-paper/pan13-notebook.log b/pan13-paper/pan13-notebook.log

index bb8b05939044829057bf616fcadae03cd6a9533e..4efdadb48893801d8ff5b21af5676b7c92d87146 100644 (file)
--- a/pan13-paper/pan13-notebook.log
+++ b/pan13-paper/pan13-notebook.log
@@ -1,4 +1,4 @@
-This is pdfeTeXk, Version 3.141592-1.11a-2.1 (Web2C 7.5.2) (format=pdflatex 2011.8.15)  28 MAY 2013 21:56
+This is pdfeTeXk, Version 3.141592-1.11a-2.1 (Web2C 7.5.2) (format=pdflatex 2011.8.15)  30 MAY 2013 13:01
  entering extended mode
   %&-line parsing enabled.
  **pan13-notebook.tex
@@ -294,36 +294,39 @@ LaTeX Font Info:    External font `cmex10' loaded for size
  
   <./img/source_retrieval_process.pdf>]
  LaTeX Font Info:    Font shape `T1/ptm/bx/n' in size <10> not available
-(Font)              Font shape `T1/ptm/b/n' tried instead on input line 49.
+(Font)              Font shape `T1/ptm/b/n' tried instead on input line 50.
  [3]
  
-LaTeX Warning: Citation `awfc' on page 4 undefined on input line 92.
+LaTeX Warning: Reference `resSelection' on page 4 undefined on input line 127.
  
-) [4]
+[4]) [5]
  \openout2 = `yenya-text_alignment.aux'.
  
- (./yenya-text_alignment.tex) [5
+ (./yenya-text_alignment.tex) [6
  
  
-] (./pan13-notebook.bbl) [6
+] (./pan13-notebook.bbl) [7
  
  ]
  (./pan13-notebook.aux (./simon-source_retrieval.aux)
-(./yenya-text_alignment.aux)) ) 
+(./yenya-text_alignment.aux))
+
+LaTeX Warning: There were undefined references.
+
+ ) 
  Here is how much of TeX's memory you used:
- 1873 strings out of 94668
- 22750 string characters out of 1175711
- 78682 words of memory out of 1527924
- 4992 multiletter control sequences out of 10000+50000
+ 1875 strings out of 94668
+ 22776 string characters out of 1175711
+ 78690 words of memory out of 1527932
+ 4994 multiletter control sequences out of 10000+50000
   47511 words of font info for 49 fonts, out of 1000000 for 2000
   458 hyphenation exceptions out of 1000
   29i,9n,21p,221b,226s stack positions out of 5000i,500n,6000p,200000b,40000s
- 59 PDF objects out of 300000
+ 63 PDF objects out of 300000
   0 named destinations out of 131072
   6 words of extra memory for PDF output out of 65536
-{/export/packages/share/texlive2003/texmf/dvips/
-psnfss/8r.enc}</export/packages/share/texlive2003/texmf/fonts/type1/urw/courier
-/ucrr8a.pfb></export/packages/share/texlive2003/texmf/fonts/type1/urw/times/utm
-r8a.pfb></export/packages/share/texlive2003/texmf/fonts/type1/urw/times/utmb8a.
-pfb>
-Output written on pan13-notebook.pdf (6 pages, 152139 bytes).
+{/export/packages/share/texlive2003/texmf/dvips/psnfss/8r.enc}</export/packag
+es/share/texlive2003/texmf/fonts/type1/urw/courier/ucrr8a.pfb></export/packages
+/share/texlive2003/texmf/fonts/type1/urw/times/utmr8a.pfb></export/packages/sha
+re/texlive2003/texmf/fonts/type1/urw/times/utmb8a.pfb>
+Output written on pan13-notebook.pdf (7 pages, 154804 bytes).
diff --git a/pan13-paper/pan13-notebook.pdf b/pan13-paper/pan13-notebook.pdf

index 8835ca1959f446a9b749af739c62f83f61566d36..d7ce95027ac6633d065db1526ad1e580c373a2a7 100644 (file)

Binary files a/pan13-paper/pan13-notebook.pdf and b/pan13-paper/pan13-notebook.pdf differ
diff --git a/pan13-paper/simon-source_retrieval.aux b/pan13-paper/simon-source_retrieval.aux

index 44df3eb2abd032f10693840d1fa17fcd95448a01..95d16bbbed8d4ade315417efe00131761216efd4 100644 (file)
--- a/pan13-paper/simon-source_retrieval.aux
+++ b/pan13-paper/simon-source_retrieval.aux
@@ -1,8 +1,9 @@
  \relax 
-\citation{chatnoir}
+\citation{plagCorpus}
  \@writefile{toc}{\contentsline {section}{\numberline {2}Source Retrieval}{2}}
  \@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Source retrieval process.}}{2}}
  \newlabel{fig:source_retr_process}{{1}{2}}
+\citation{chatnoir}
  \citation{suchomel_kas_12}
  \citation{Introduction_to_information_retrieval}
  \citation{ententen}
@@ -13,11 +14,12 @@
  \citation{suchomel_kas_12}
  \@writefile{toc}{\contentsline {subsubsection}{Intrinsic Plagiarism Based Queries.}{4}}
  \@writefile{toc}{\contentsline {subsubsection}{Paragraph Based Queries.}{4}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{4}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{4}}
-\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{4}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{5}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{5}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{5}}
+\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Source Retrieval Results}{5}}
  \@setckpt{simon-source_retrieval}{
-\setcounter{page}{5}
+\setcounter{page}{6}
  \setcounter{equation}{0}
  \setcounter{enumi}{0}
  \setcounter{enumii}{0}
@@ -27,7 +29,7 @@
  \setcounter{mpfootnote}{0}
  \setcounter{part}{0}
  \setcounter{section}{2}
-\setcounter{subsection}{4}
+\setcounter{subsection}{5}
  \setcounter{subsubsection}{0}
  \setcounter{paragraph}{0}
  \setcounter{subparagraph}{0}
diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex

index c27a16e627ebf792f05603fa21f075d4f2460e0c..2cb1a8f9c4f32945ef00a87e94c8fcc2fd9730c3 100755 (executable)
--- a/pan13-paper/simon-source_retrieval.tex
+++ b/pan13-paper/simon-source_retrieval.tex
@@ -5,9 +5,9 @@ large corpus. Those candidate documents are usually further compared in detail w
  suspicious document. In the PAN 2013 source retrieval subtask the main goal was to\r
  identify web pages which have been used as a source of plagiarism for creation of the \r
  test corpus. \r
-The test corpus contained XX documents each discussing one and only one theme.\r
+The test corpus contained 58 documents each discussing one and only one theme.\r
  Those documents were created intentionally by\r
- semiprofessional writers, thus they feature nearly realistic plagiarism cases. \r
+ semiprofessional writers, thus they feature nearly realistic plagiarism cases~\cite{plagCorpus}. \r
   Such conditions are similar to a realistic plagiarism detection scenario, such as for\r
  state of the art commercial plagiarism detection systems or the anti-plagiarism service developed on and\r
  utilized at the Masaryk University. The main difference between real-world corpus \r
@@ -33,7 +33,8 @@ In real-world scenario the corpus is the whole Web and the search engine can be
  which scales to the size of the Web. This methodology is based on the fact that we do not\r
  possess enough resources to download and effectively process the whole corpus.\r
  In the case of PAN 2013 competition the corpus\r
-of source documents is the ClueWeb\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus. \r
+of source documents is the ClueWeb\footnote{\url{http://lemurproject.org/clueweb09.php/}} corpus.\r
+\r
  As a document retrieval tool for the competition we utilized the ChatNoir~\cite{chatnoir} search engine which indexes the English\r
  subset of the ClueWeb.   \r
  The reverse engineering decision process resides in the creation of suitable queries on the basis of the suspicious document\r
@@ -103,17 +104,47 @@ The intrinsic plagiarism based queries are positional. They carry the position o
  They are phrasal, since they represent a search for a specific sentence. And they are\r
  nondeterministic, because the representative sentence is selected randomly. \r
   \r
+\subsubsection{Paragraph Based Queries.}\r
+The purpose of paragraph based queries is to check some parts of the text in more depth,\r
+namely the parts for which no similarity has been found during previous searches. \r
  \r
+For this case we considered a paragraph as a minimum text chunk for plagiarism to occur. \r
+It is debatable whether a plagiarist would be prosecuted for plagiarizing only one sentence in a paragraph.\r
+Also, the detection of a specific sentence is very difficult if we want to avoid an exhaustive search approach.\r
+If someone is to reuse some piece of continuous text, it would probably be no shorter than a paragraph. \r
+Despite the fact that paragraphs differ in length, we represent one paragraph by one query.\r
  \r
-\subsubsection{Paragraph Based Queries.}\r
-they were executed as the last asi az v search control\r
-it would be extremely difficult to detect a single sentence other way than by exhaustive searching methods\r
+\r
+The paragraph based query was created from each paragraph of a suspicious document.\r
+From each paragraph we extracted the longest sentence from which the query was constructed.\r
+Ideally the extracted sentence should carry the highest information gain.\r
+The query was at most 10 words in length, which is the upper bound of ChatNoir,\r
+and was constructed from the selected sentence by omitting stop words.\r
  \r
  \subsection{Search Control}\r
-neoptimalizujeme na spravne utvorene dotazy z klicovych slov - stoji to vice dotazu\r
+For each suspicious document we prepared all three types of queries during the first phase at once.\r
+Queries were executed stepwise. \r
+After processing each query, the results were evaluated (see the following subsection~\ref{resSelection} for more details), and for\r
+all textual similarities between each result and the suspicious document, the corresponding intervals of the suspicious document\r
+were marked as `discovered'. \r
+The keywords based queries were executed first. All of the keywords based queries were\r
+always executed. \r
+After having all the keywords based queries processed, the intrinsic plagiarism based queries were executed according to \r
+their creation sequence. \r
+Since they carry their position, not all of the intrinsic plagiarism based queries were carried out.\r
+During the execution, if the position of a query intersected with any of the `discovered' intervals, the\r
+query was dropped. In the same way, the paragraph based queries were processed last. \r
+\r
+This search control results in two major properties. Firstly, the source retrieval effort was increased \r
+in parts of the suspicious document where no textual similarity had yet been found,\r
+especially by the paragraph based queries. Secondly, after detecting a similarity for a certain part of the text,\r
+no more intentional retrieval attempts for that part were made. This means that all\r
+discovered search engine results were evaluated, but no more queries regarding that passage were executed.\r
  \r
  \r
  \subsection{Result Selection}\r
+\label{resSelection}\r
  \subsection{Snippet Control}\r
+\subsection{Source Retrieval Results}\r
  \r
  \r
diff --git a/pan13-paper/yenya-text_alignment.aux b/pan13-paper/yenya-text_alignment.aux

index d67cb0ac03a05d442e915809dfb98ef4fa430653..e01f6f19bd405b269b13975634257b8b269ec128 100644 (file)
--- a/pan13-paper/yenya-text_alignment.aux
+++ b/pan13-paper/yenya-text_alignment.aux
@@ -1,8 +1,8 @@
  \relax 
-\@writefile{toc}{\contentsline {section}{\numberline {3}Text Alignment}{5}}
-\newlabel{text_alignment}{{3}{5}}
+\@writefile{toc}{\contentsline {section}{\numberline {3}Text Alignment}{6}}
+\newlabel{text_alignment}{{3}{6}}
  \@setckpt{yenya-text_alignment}{
-\setcounter{page}{6}
+\setcounter{page}{7}
  \setcounter{equation}{0}
  \setcounter{enumi}{0}
  \setcounter{enumii}{0}
author	Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
	Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)
committer	Simon Suchomel <xsuchom1@anxur.fi.muni.cz>
	Thu, 30 May 2013 11:02:07 +0000 (13:02 +0200)
pan13-paper/pan13-notebook.aux		patch \| blob \| history
pan13-paper/pan13-notebook.bib		patch \| blob \| history
pan13-paper/pan13-notebook.log		patch \| blob \| history
pan13-paper/pan13-notebook.pdf		patch \| blob \| history
pan13-paper/simon-source_retrieval.aux		patch \| blob \| history
pan13-paper/simon-source_retrieval.tex		patch \| blob \| history
pan13-paper/yenya-text_alignment.aux		patch \| blob \| history