From: Simon Suchomel Date: Thu, 30 May 2013 15:46:15 +0000 (+0200) Subject: Pisu dal X-Git-Tag: odeslano-20130601-2314~11 X-Git-Url: https://www.fi.muni.cz/~kas/git//home/kas/public_html/git/?p=pan13-paper.git;a=commitdiff_plain;h=d099b098d7b507d64ad54ddf73f9fbd489a0e95f Pisu dal --- diff --git a/pan13-paper/pan13-notebook.aux b/pan13-paper/pan13-notebook.aux deleted file mode 100644 index 8691625..0000000 --- a/pan13-paper/pan13-notebook.aux +++ /dev/null @@ -1,21 +0,0 @@ -\relax -\select@language{american} -\@writefile{toc}{\select@language{american}} -\@writefile{lof}{\select@language{american}} -\@writefile{lot}{\select@language{american}} -\@writefile{toc}{\contentsline {title}{Improving plagiarism detection}{1}} -\@writefile{toc}{\authcount {3}} -\@writefile{toc}{\contentsline {author}{\v {S}imon Suchomel \and Jan Kasprzak \and Michal Brandejs}{1}} -\@writefile{toc}{\contentsline {section}{\numberline {1}Introduction}{1}} -\@input{simon-source_retrieval.aux} -\@input{yenya-text_alignment.aux} -\bibstyle{splncs03} -\bibdata{pan13-notebook} -\bibcite{ententen}{1} -\bibcite{awfc}{2} -\bibcite{Introduction_to_information_retrieval}{3} -\bibcite{chatnoir}{4} -\bibcite{plagCorpus}{5} -\bibcite{suchomel_kas_12}{6} -\bibcite{SpiderLink}{7} -\@writefile{toc}{\contentsline {section}{\numberline {4}Conclusions}{7}} diff --git a/pan13-paper/pan13-notebook.log b/pan13-paper/pan13-notebook.log deleted file mode 100644 index 4efdadb..0000000 --- a/pan13-paper/pan13-notebook.log +++ /dev/null @@ -1,332 +0,0 @@ -This is pdfeTeXk, Version 3.141592-1.11a-2.1 (Web2C 7.5.2) (format=pdflatex 2011.8.15) 30 MAY 2013 13:01 -entering extended mode - %&-line parsing enabled. 
-**pan13-notebook.tex -(./pan13-notebook.tex{/export/packages/share/texlive2003/texmf-var/pdftex/confi -g/pdftex.cfg} -LaTeX2e <2001/06/01> -Babel and hyphenation patterns for english, french, german, ngerman, ca -talan, croatian, czech, danish, dutch, estonian, finnish, greek, hungarian, ita -lian, latin, mongolian, norwegian, polish, portuguese, romanian, russian, slove -ne, serbocroat, slovak, spanish, swedish, ukenglish, ukrainian, welsh, dumylang -, nohyphenation, loaded. -(./llncs.cls -Document Class: llncs 2010/04/15 v2.16 - LaTeX document class for Lecture Notes in Computer Science -(/export/packages/share/texlive2003/texmf/tex/latex/base/article.cls -Document Class: article 2001/04/21 v1.4e Standard LaTeX document class -(/export/packages/share/texlive2003/texmf/tex/latex/base/size10.clo -File: size10.clo 2001/04/21 v1.4e Standard LaTeX file (size option) -) -\c@part=\count79 -\c@section=\count80 -\c@subsection=\count81 -\c@subsubsection=\count82 -\c@paragraph=\count83 -\c@subparagraph=\count84 -\c@figure=\count85 -\c@table=\count86 -\abovecaptionskip=\skip41 -\belowcaptionskip=\skip42 -\bibindent=\dimen102 -) -(/export/packages/share/texlive2003/texmf/tex/latex/tools/multicol.sty -Package: multicol 2000/07/10 v1.5z multicolumn formatting (FMi) -\c@tracingmulticols=\count87 -\mult@box=\box26 -\multicol@leftmargin=\dimen103 -\c@unbalance=\count88 -\c@collectmore=\count89 -\doublecol@number=\count90 -\multicoltolerance=\count91 -\multicolpretolerance=\count92 -\full@width=\dimen104 -\page@free=\dimen105 -\premulticols=\dimen106 -\postmulticols=\dimen107 -\multicolsep=\skip43 -\multicolbaselineskip=\skip44 -\partial@page=\box27 -\last@line=\box28 -\mult@rightbox=\box29 -\mult@grightbox=\box30 -\mult@gfirstbox=\box31 -\mult@firstbox=\box32 -\@tempa=\box33 -\@tempa=\box34 -\@tempa=\box35 -\@tempa=\box36 -\@tempa=\box37 -\@tempa=\box38 -\@tempa=\box39 -\@tempa=\box40 -\@tempa=\box41 -\@tempa=\box42 -\@tempa=\box43 -\@tempa=\box44 -\@tempa=\box45 -\@tempa=\box46 
-\@tempa=\box47 -\@tempa=\box48 -\@tempa=\box49 -\c@columnbadness=\count93 -\c@finalcolumnbadness=\count94 -\last@try=\dimen108 -\multicolovershoot=\dimen109 -\multicolundershoot=\dimen110 -\mult@nat@firstbox=\box50 -\colbreak@box=\box51 -) -(./aliascnt.sty -Package: aliascnt 2009/09/08 v1.3 Alias counter (HO) - -(/export/packages/share/texlive2003/texmf/tex/latex/carlisle/remreset.sty)) -\c@chapter=\count95 -LaTeX Font Info: Redeclaring math symbol \Gamma on input line 361. -LaTeX Font Info: Redeclaring math symbol \Delta on input line 362. -LaTeX Font Info: Redeclaring math symbol \Theta on input line 363. -LaTeX Font Info: Redeclaring math symbol \Lambda on input line 364. -LaTeX Font Info: Redeclaring math symbol \Xi on input line 365. -LaTeX Font Info: Redeclaring math symbol \Pi on input line 366. -LaTeX Font Info: Redeclaring math symbol \Sigma on input line 367. -LaTeX Font Info: Redeclaring math symbol \Upsilon on input line 368. -LaTeX Font Info: Redeclaring math symbol \Phi on input line 369. -LaTeX Font Info: Redeclaring math symbol \Psi on input line 370. -LaTeX Font Info: Redeclaring math symbol \Omega on input line 371. 
-\tocchpnum=\dimen111 -\tocsecnum=\dimen112 -\tocsectotal=\dimen113 -\tocsubsecnum=\dimen114 -\tocsubsectotal=\dimen115 -\tocsubsubsecnum=\dimen116 -\tocsubsubsectotal=\dimen117 -\tocparanum=\dimen118 -\tocparatotal=\dimen119 -\tocsubparanum=\dimen120 -\@tempcntc=\count96 -\fnindent=\dimen121 -\c@@inst=\count97 -\c@@auth=\count98 -\c@auco=\count99 -\instindent=\dimen122 -\authrun=\box52 -\authorrunning=\toks14 -\tocauthor=\toks15 -\titrun=\box53 -\titlerunning=\toks16 -\toctitle=\toks17 -\c@theorem=\count100 -\c@case=\count101 -\c@conjecture=\count102 -\c@corollary=\count103 -\c@definition=\count104 -\c@example=\count105 -\c@exercise=\count106 -\c@lemma=\count107 -\c@note=\count108 -\c@problem=\count109 -\c@property=\count110 -\c@proposition=\count111 -\c@question=\count112 -\c@solution=\count113 -\c@remark=\count114 -\headlineindent=\dimen123 -) -(/export/packages/share/texlive2003/texmf/tex/generic/babel/babel.sty -Package: babel 2001/03/01 v3.7h The Babel package - -(/export/packages/share/texlive2003/texmf/tex/generic/babel/english.ldf -Language: english 2001/04/15 v3.3l English support from the babel system - -(/export/packages/share/texlive2003/texmf/tex/generic/babel/babel.def -File: babel.def 2001/03/01 v3.7h Babel common definitions -\babel@savecnt=\count115 -\U@D=\dimen124 -) -\l@canadian = a dialect from \language\l@english -)) -(/export/packages/share/texlive2003/texmf/tex/latex/base/fontenc.sty -Package: fontenc 2001/06/05 v1.94 Standard LaTeX package - -(/export/packages/share/texlive2003/texmf/tex/latex/base/t1enc.def -File: t1enc.def 2001/06/05 v1.94 Standard LaTeX file -LaTeX Font Info: Redeclaring font encoding T1 on input line 38. 
-)) -(/export/packages/share/texlive2003/texmf/tex/latex/psnfss/times.sty -Package: times 2002/09/08 PSNFSS-v9.0a (SPQR) -) -(/export/packages/share/texlive2003/texmf/tex/latex/graphics/graphicx.sty -Package: graphicx 1999/02/16 v1.0f Enhanced LaTeX Graphics (DPC,SPQR) - -(/export/packages/share/texlive2003/texmf/tex/latex/graphics/keyval.sty -Package: keyval 1999/03/16 v1.13 key=value parser (DPC) -\KV@toks@=\toks18 -) -(/export/packages/share/texlive2003/texmf/tex/latex/graphics/graphics.sty -Package: graphics 2001/07/07 v1.0n Standard LaTeX Graphics (DPC,SPQR) - -(/export/packages/share/texlive2003/texmf/tex/latex/graphics/trig.sty -Package: trig 1999/03/16 v1.09 sin cos tan (DPC) -) -(/export/packages/share/texlive2003/texmf/tex/latex/texlive/graphics.cfg -File: graphics.cfg 2001/08/31 v1.1 graphics configuration of teTeX/TeXLive -) -Package graphics Info: Driver file: pdftex.def on input line 80. - -(/export/packages/share/texlive2003/texmf/tex/latex/pdftexdef/pdftex.def -File: pdftex.def 2002/06/19 v0.03k graphics/color for pdftex -\Gread@gobject=\count116 -)) -\Gin@req@height=\dimen125 -\Gin@req@width=\dimen126 -) -(./pan13-notebook.aux (./simon-source_retrieval.aux) -(./yenya-text_alignment.aux)) -\openout1 = `pan13-notebook.aux'. - -LaTeX Font Info: Checking defaults for OML/cmm/m/it on input line 8. -LaTeX Font Info: ... okay on input line 8. -LaTeX Font Info: Checking defaults for T1/cmr/m/n on input line 8. -LaTeX Font Info: ... okay on input line 8. -LaTeX Font Info: Checking defaults for OT1/cmr/m/n on input line 8. -LaTeX Font Info: ... okay on input line 8. -LaTeX Font Info: Checking defaults for OMS/cmsy/m/n on input line 8. -LaTeX Font Info: ... okay on input line 8. -LaTeX Font Info: Checking defaults for OMX/cmex/m/n on input line 8. -LaTeX Font Info: ... okay on input line 8. -LaTeX Font Info: Checking defaults for U/cmr/m/n on input line 8. -LaTeX Font Info: ... okay on input line 8. 
-LaTeX Font Info: Try loading font information for T1+ptm on input line 8. - -(/export/packages/share/texlive2003/texmf/tex/latex/psnfss/t1ptm.fd -File: t1ptm.fd 2001/06/04 font definitions for T1/ptm. -) -(/export/packages/share/texlive2003/texmf/tex/context/base/supp-pdf.tex -(/export/packages/share/texlive2003/texmf/tex/context/base/supp-mis.tex -loading : Context Support Macros / Miscellaneous -\protectiondepth=\count117 -\scratchcounter=\count118 -\scratchtoks=\toks19 -\scratchdimen=\dimen127 -\scratchskip=\skip45 -\scratchmuskip=\muskip10 -\scratchbox=\box54 -\scratchread=\read1 -\scratchwrite=\write3 -\zeropoint=\dimen128 -\minusone=\count119 -\thousandpoint=\dimen129 -\emptytoks=\toks20 -\nextbox=\box55 -\nextdepth=\dimen130 -\everyline=\toks21 -\!!counta=\count120 -\!!countb=\count121 -\recursecounter=\count122 -) -loading : Context Support Macros / PDF -\nofMPsegments=\count123 -\nofMParguments=\count124 -\everyMPtoPDFconversion=\toks22 -) -LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <14.4> not available -(Font) Font shape `T1/ptm/b/n' tried instead on input line 18. -LaTeX Font Info: External font `cmex10' loaded for size -(Font) <7> on input line 18. -LaTeX Font Info: External font `cmex10' loaded for size -(Font) <5> on input line 18. -LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <12> not available -(Font) Font shape `T1/ptm/b/n' tried instead on input line 18. -LaTeX Font Info: Try loading font information for T1+pcr on input line 18. - (/export/packages/share/texlive2003/texmf/tex/latex/psnfss/t1pcr.fd -File: t1pcr.fd 2001/06/04 font definitions for T1/pcr. -) -LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <9> not available -(Font) Font shape `T1/ptm/b/n' tried instead on input line 20. 
- [1 - -{/export/packages/share/texlive2003/texmf-var/fonts/map/pdftex/updmap/pdftex.ma -p -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmbi108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmbo108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmbu108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmrb108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmri108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmrm108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmro108r' already exists, duplicates igno -red - -Warning: pdflatex (file /export/packages/share/texlive2003/texmf-var/fonts/map/ -pdftex/updmap/pdftex.map): entry for `tmui108r' already exists, duplicates igno -red -}] -\openout2 = `simon-source_retrieval.aux'. - - (./simon-source_retrieval.tex - -File: img/source_retrieval_process.pdf Graphic file (type pdf) - - -LaTeX Font Info: External font `cmex10' loaded for size -(Font) <9> on input line 36. -LaTeX Font Info: External font `cmex10' loaded for size -(Font) <6> on input line 36. - [2 - - <./img/source_retrieval_process.pdf>] -LaTeX Font Info: Font shape `T1/ptm/bx/n' in size <10> not available -(Font) Font shape `T1/ptm/b/n' tried instead on input line 50. 
-[3] - -LaTeX Warning: Reference `resSelection' on page 4 undefined on input line 127. - -[4]) [5] -\openout2 = `yenya-text_alignment.aux'. - - (./yenya-text_alignment.tex) [6 - - -] (./pan13-notebook.bbl) [7 - -] -(./pan13-notebook.aux (./simon-source_retrieval.aux) -(./yenya-text_alignment.aux)) - -LaTeX Warning: There were undefined references. - - ) -Here is how much of TeX's memory you used: - 1875 strings out of 94668 - 22776 string characters out of 1175711 - 78690 words of memory out of 1527932 - 4994 multiletter control sequences out of 10000+50000 - 47511 words of font info for 49 fonts, out of 1000000 for 2000 - 458 hyphenation exceptions out of 1000 - 29i,9n,21p,221b,226s stack positions out of 5000i,500n,6000p,200000b,40000s - 63 PDF objects out of 300000 - 0 named destinations out of 131072 - 6 words of extra memory for PDF output out of 65536 -{/export/packages/share/texlive2003/texmf/dvips/psnfss/8r.enc} -Output written on pan13-notebook.pdf (7 pages, 154804 bytes). diff --git a/pan13-paper/pan13-notebook.pdf b/pan13-paper/pan13-notebook.pdf deleted file mode 100644 index d7ce950..0000000 Binary files a/pan13-paper/pan13-notebook.pdf and /dev/null differ diff --git a/pan13-paper/simon-source_retrieval.aux b/pan13-paper/simon-source_retrieval.aux deleted file mode 100644 index 95d16bb..0000000 --- a/pan13-paper/simon-source_retrieval.aux +++ /dev/null @@ -1,57 +0,0 @@ -\relax -\citation{plagCorpus} -\@writefile{toc}{\contentsline {section}{\numberline {2}Source Retrieval}{2}} -\@writefile{lof}{\contentsline {figure}{\numberline {1}{\ignorespaces Source retrieval process.}}{2}} -\newlabel{fig:source_retr_process}{{1}{2}} -\citation{chatnoir} -\citation{suchomel_kas_12} -\citation{Introduction_to_information_retrieval} -\citation{ententen} -\citation{SpiderLink} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.1}Querying}{3}} -\@writefile{toc}{\contentsline {subsubsection}{Keywords Based Queries.}{3}} -\citation{awfc} 
-\citation{suchomel_kas_12} -\@writefile{toc}{\contentsline {subsubsection}{Intrinsic Plagiarism Based Queries.}{4}} -\@writefile{toc}{\contentsline {subsubsection}{Paragraph Based Queries.}{4}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.2}Search Control}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.3}Result Selection}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.4}Snippet Control}{5}} -\@writefile{toc}{\contentsline {subsection}{\numberline {2.5}Source Retrieval Results}{5}} -\@setckpt{simon-source_retrieval}{ -\setcounter{page}{6} -\setcounter{equation}{0} -\setcounter{enumi}{0} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{0} -\setcounter{footnote}{2} -\setcounter{mpfootnote}{0} -\setcounter{part}{0} -\setcounter{section}{2} -\setcounter{subsection}{5} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{1} -\setcounter{table}{0} -\setcounter{chapter}{1} -\setcounter{@inst}{1} -\setcounter{@auth}{3} -\setcounter{auco}{3} -\setcounter{theorem}{0} -\setcounter{case}{0} -\setcounter{conjecture}{0} -\setcounter{corollary}{0} -\setcounter{definition}{0} -\setcounter{example}{0} -\setcounter{exercise}{0} -\setcounter{lemma}{0} -\setcounter{note}{0} -\setcounter{problem}{0} -\setcounter{property}{0} -\setcounter{proposition}{0} -\setcounter{question}{0} -\setcounter{solution}{0} -\setcounter{remark}{0} -} diff --git a/pan13-paper/simon-source_retrieval.tex b/pan13-paper/simon-source_retrieval.tex index 2cb1a8f..d5b338b 100755 --- a/pan13-paper/simon-source_retrieval.tex +++ b/pan13-paper/simon-source_retrieval.tex @@ -50,7 +50,8 @@ of those parts is done. \subsection{Querying} Querying means to effectively utilize the search engine in order to retrieve as many relevant documents as possible with the minimum amount of queries. 
We consider the resulting document relevant -if it shares some of text characteristics with the suspicious document. +if it shares some of the text characteristics with the suspicious document. In a real-world setting, queries as such +represent an appreciable cost; therefore, their minimization should be one of the top priorities. We used 3 different types of queries\footnote{We used similar three-way based methodology in PAN 2012 Candidate Document Retrieval subtask. However, this time we completely replaced the headers based queries @@ -143,8 +144,31 @@ discovered search engine results were evaluated, but there were executed no more \subsection{Result Selection} +The second main decision area of the source retrieval task is to decide which of the search engine results to download. +This process is represented in figure~\ref{fig:source_retr_process} as `Selecting'. +Nowadays, in the real world, downloading is a very cheap and quick operation. There can be some disk space considerations +if there is a need to store the original downloaded documents. The main cost is represented by document post-processing. +On the Internet in particular, there is a wide range of file formats, which must be +converted into plain text for text alignment. This can be time-consuming and computationally expensive. For example, the plain text is hard to extract from many +PDF documents; thus, one needs to use optical character recognition methods. + +ChatNoir offers snippets for discovered documents. Snippet generation is considered a costless +operation. The purpose of a snippet is to give a quick glance at a small extract of the resulting page. +The extract is at most 500 characters long, and it is a portion of the document around the given keywords. +On the basis of the snippet, we needed to decide whether to actually download the result or not. 
+ +Since the snippet is relatively small and it can be a discontinuous part of the text, the +text alignment methods described in section~\ref{text_alignment} were insufficient for + + \subsection{Snippet Control} +\begin{figure} + \centering + \includegraphics[width=1.00\textwidth]{img/snippets_graph.pdf} + \caption{Downloads and similarities performance.} + \label{fig:snippet_graph} +\end{figure} \subsection{Source Retrieval Results} diff --git a/pan13-paper/yenya-text_alignment.aux b/pan13-paper/yenya-text_alignment.aux deleted file mode 100644 index e01f6f1..0000000 --- a/pan13-paper/yenya-text_alignment.aux +++ /dev/null @@ -1,40 +0,0 @@ -\relax -\@writefile{toc}{\contentsline {section}{\numberline {3}Text Alignment}{6}} -\newlabel{text_alignment}{{3}{6}} -\@setckpt{yenya-text_alignment}{ -\setcounter{page}{7} -\setcounter{equation}{0} -\setcounter{enumi}{0} -\setcounter{enumii}{0} -\setcounter{enumiii}{0} -\setcounter{enumiv}{0} -\setcounter{footnote}{2} -\setcounter{mpfootnote}{0} -\setcounter{part}{0} -\setcounter{section}{3} -\setcounter{subsection}{0} -\setcounter{subsubsection}{0} -\setcounter{paragraph}{0} -\setcounter{subparagraph}{0} -\setcounter{figure}{1} -\setcounter{table}{0} -\setcounter{chapter}{1} -\setcounter{@inst}{1} -\setcounter{@auth}{3} -\setcounter{auco}{3} -\setcounter{theorem}{0} -\setcounter{case}{0} -\setcounter{conjecture}{0} -\setcounter{corollary}{0} -\setcounter{definition}{0} -\setcounter{example}{0} -\setcounter{exercise}{0} -\setcounter{lemma}{0} -\setcounter{note}{0} -\setcounter{problem}{0} -\setcounter{property}{0} -\setcounter{proposition}{0} -\setcounter{question}{0} -\setcounter{solution}{0} -\setcounter{remark}{0} -}