]> www.fi.muni.cz Git - pan13-paper.git/blob - pan13-poster/poster.tex
5e3c9a095b4e30283e5c9b63b798d7325becc268
[pan13-paper.git] / pan13-poster / poster.tex
1 \documentclass[a0,portrait]{sciposter}\r
2 \r
3 \usepackage{epsfig}\r
4 \usepackage{amsmath}\r
5 \usepackage{amssymb}\r
6 \usepackage{multicol}\r
7 \usepackage{bera}\r
8 \usepackage[utf8]{inputenc}\r
9 %\usepackage{fancybullets}\r
10 %\usepackage{floatflt}\r
11 %\usepackage{graphics}\r
12 \r
13 \definecolor{BoxCol}{rgb}{0.9,0.9,1}\r
14 % uncomment for light blue background to \section boxes \r
15 % for use with default option boxedsections\r
16 \r
17 \definecolor{SectionCol}{rgb}{0,0,0.5}\r
18 % uncomment for dark blue \section text \r
19 \r
20 \definecolor{ReallyEmph}{rgb}{0.7,0,0}\r
21 \r
22 \renewcommand{\titlesize}{\Huge}\r
23 \title{Diverse Queries and Feature Type Selection \\ for Plagiarism Discovery}\r
24 \r
25 % Note: only give author names, not institute\r
26 \author{Šimon Suchomel, Jan Kasprzak, and Michal Brandejs}\r
27  \r
28 % insert correct institute name\r
29 \institute{Faculty of Informatics, Masaryk University, Brno, Czech Republic}\r
30 \r
31 % \email{kas@fi.muni.cz}  % shows author email address below institute\r
32 \r
33 %\date is unused by the current \maketitle\r
34 \r
35 \font\logofont=fi-logo600 at .16\textwidth\r
36 \r
37 \renewcommand{\sectionsize}{\Large}\r
38 \r
39 \newcommand{\cemph}[1]{{\sffamily\bfseries\itshape \textcolor{SectionCol}{#1}}}\r
40 \newcommand{\lemph}[1]{{\rmfamily\itshape \textcolor{SectionCol}{#1}}}\r
41 \newcommand{\eitem}[1]{\item \cemph{#1}}\r
42 \r
43 \newenvironment{ytemize}\r
44   { \begin{itemize}\r
45         \setlength{\itemsep}{0pt}\r
46         \setlength{\parskip}{0pt}\r
47   }\r
48   { \end{itemize} }\r
49 \r
50 \conference{{\bf CLEF 2013}, 23--27 September 2013, Valencia, Spain}\r
51 \r
52 \setlength{\figbotskip}{\smallskipamount}\r
53 \r
54 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%\r
55 %%% Begin of Document\r
56 \r
57 \begin{document}\r
58 \r
59 \r
60 %\LEFTSIDEfootlogo  \r
61 % Uncomment to put footer logo on left side, and \r
62 % conference name on right side of footer\r
63 \r
64 % Some examples of caption control (remove % to check result)\r
65 \r
66 %\renewcommand{\algorithmname}{Algoritme} % for Dutch\r
67 \r
68 %\renewcommand{\mastercapstartstyle}[1]{\textit{\textbf{#1}}}\r
69 %\renewcommand{\algcapstartstyle}[1]{\textsc{\textbf{#1}}}\r
70 %\renewcommand{\algcapbodystyle}{\bfseries}\r
71 %\renewcommand{\thealgorithm}{\Roman{algorithm}}\r
72 \r
73 % \maketitle\r
74 \r
75 \vspace*{-.06\textwidth}\r
76 \r
77 \hbox to \hsize{\r
78 \begin{minipage}[c]{.11\textwidth} % left box: the znak_MU_modry graphic, positioned with negative skips\r
79         \vspace{-.75\textwidth}\r
80         \hbox{\hskip -.83\textwidth\includegraphics[width=3\textwidth]{znak_MU_modry}\hskip -\textwidth}\r
81         \vspace{-\textwidth}\r
82 \end{minipage}\r
83 \hfil\r
84 \begin{minipage}[c]{.7\textwidth} % middle box: title, authors, institute and optional e-mail\r
85 \begin{center}\makeatletter % @ must be a letter here, otherwise \@title and \@author are read as \@ followed by plain text\r
86       \renewcommand{\baselinestretch}{2.0}\normalsize\r
87       {\titlesize \bf \@title}\par\r
88       \renewcommand{\baselinestretch}{1.0}\normalsize            \r
89       \vspace{0.4\titleskip}\r
90       {\authorsize {\bf\@author} \par}\r
91       {\instsize\r
92        \vspace{0.2\titleskip}\r
93        \theinstitute \par\r
94        \ifthenelse{\equal{\printemail}{}}{%nothing\r
95          }{%\r
96          \vspace{0.2\titleskip}\r
97          \texttt{\printemail}\r
98          }\r
99       }\r
100 \makeatother\end{center}\r
101 \end{minipage}\r
102 \hfil\r
103 \begin{minipage}[c]{.15\textwidth} % right box: the characters SL typeset in \logofont\r
104         \hbox to \hsize{\logofont SL\hss}\r
105 \end{minipage}\r
106 }\r
107 \r
108 \vspace{-.02\textwidth}\r
109 \r
110 %%% Begin of Multicols-Environment\r
111 %\begin{abstract}\r
112 %{\sffamily\itshape\r
113 %Nějaký abstrakt.\r
114 %}\r
115 %\end{abstract}\r
116 \r
117 \r
118 \begin{multicols}{2}\setlength{\columnseprule}{0pt}\r
119 \section{Introduction}\r
120 %\r
121 PAN 2013 LOrem ipsum Lorem ipsum Lorem ipsumLorem ipsumLorem ipsumLorem ipsumLorem ipsum \r
122 %\r
123 \vfill\r
124 \columnbreak\r
125 %\r
126 \begin{figure}\r
127  \centering\r
128   \includegraphics[width=0.6\textwidth]{img/source_retrieval_process.pdf}\r
129   \caption{Plagiarism discovery process.}\r
130   \label{fig:process}\r
131 \end{figure} \r
132 \end{multicols}\r
133 \begin{multicols}{2}\r
134 %\rm\r
135 %%% Introduction\r
136 \section{Querying}\r
137 Querying means utilizing the search engine effectively in order to retrieve as many relevant\r
138 documents as possible with the minimum number of queries.\r
139 %We consider the resulting document relevant if it shares some of text characteristics with the suspicious document.\r
140 In the real world, queries as such represent an appreciable cost; therefore, their minimization should be one of the top priorities. \r
141 %\subsection{Types of Queries}\r
142 Three diverse types of queries were extracted from the suspicious document.\\\r
143 \begin{minipage}{0.55\linewidth}\r
144 \subsection{Keywords Based Queries}\r
145 \begin{ytemize}\r
146 \item TF--IDF based automated keyword extraction;\r
147 \item 5-token long; \r
148 \item Deterministic;\r
149 \item Non-positional;\r
150 \item Non-phrasal.\r
151 \end{ytemize}\r
152 \end{minipage}\r
153 \begin{minipage}{0.45\linewidth}\r
154 \begin{figure}[h]\r
155  %\centering\r
156   \includegraphics[width=1\linewidth]{img/document_keywords.pdf}\r
157 \end{figure}\r
158 \end{minipage}\r
159 \begin{minipage}{0.55\linewidth}\r
160 \subsection{Intrinsic Plagiarism Based Queries}\r
161 \begin{ytemize}\r
162 \item Averaged Word Frequency Class based chunking~\cite{AWFC};\r
163 \item Random sentence selection from the chunk;\r
164 \item Non-deterministic;\r
165 \item Positional;\r
166 \item Phrasal.\r
167 \end{ytemize}\r
168 \end{minipage}\r
169 \begin{minipage}{0.45\linewidth}\r
170 \begin{figure}[h]\r
171  %\centering\r
172   \includegraphics[width=1\linewidth]{img/document_awfc.pdf}\r
173 \end{figure}\r
174 \end{minipage}\r
175 \begin{minipage}{0.55\linewidth}\r
176 \subsection{Paragraph Based Queries}\r
177 \begin{ytemize}\r
178 \item Longest sentences from miscellaneous paragraphs;\r
179 \item Deterministic;\r
180 \item Positional;\r
181 \item Phrasal.\r
182 \end{ytemize}\r
183 \end{minipage}\r
184 \begin{minipage}{0.45\linewidth}\r
185 \begin{figure}[h]\r
186  %\centering\r
187   \includegraphics[width=1\linewidth]{img/document_paragraphs.pdf}\r
188 \end{figure}\r
189 \end{minipage}\r
190 \r
191 \begin{figure}[h]\r
192  \centering\r
193   \includegraphics[width=0.8\linewidth]{img/queryprocess.pdf}\r
194    \caption{Stepwise query execution process.}\r
195 \end{figure}\r
196 \r
197 \section{Selecting}\r
198 Document snippets were used for deciding whether to download the document for text alignment.\r
199 We used a 2-tuple measure, which indicates how many neighbouring word pairs coexist in the snippet and in the suspicious document.\r
200 The performance of this measure is depicted in Figure~\ref{fig:snippet_graph}.\r
201 Given this measure, a threshold for the download decision needs to be set in order to maximize the number of discovered similarities\r
202 and minimize the total number of downloads.\r
203 A profitable threshold is one that corresponds to the largest distance between those two curves.\r
204 \begin{figure}\r
205   \centering\r
206   \includegraphics[width=0.8\textwidth]{img/snippets_graph.pdf}\r
207   \caption{Downloads and similarities performance.}\r
208   \label{fig:snippet_graph}\r
209 \end{figure}\r
210 \r
211 \r
212 %\r
213 % Yenyova cast\r
214 %\r
215 \r
216 \section{Text Alignment}\r
217 \r
218 %\r
219 % Spolecna cast\r
220 %\r
221 \r
222 \section{Conclusion}\r
223 \r
224 Nějaký závěr\r
225 \r
226 %%% References\r
227 \r
228 %% Note: use of BibTeX also works!!\r
229 \r
230 \bibliographystyle{plain}\r
231 \begin{thebibliography}{1}\r
232 \r
233 \bibitem{ISMU}\r
234 \cemph{Masaryk University Information System}\\\r
235 {\tt http://is.muni.cz/}, contact: {\tt iscor@fi.muni.cz}.\r
236 \r
237 \bibitem{Theses}\r
238 \cemph{Czech National Archive of Graduate Theses}\\\r
239 {\tt http://theses.cz/}, contact: {\tt theses@fi.muni.cz}.\r
240 \r
241 \bibitem{AWFC}\r
242 \cemph{Sven Meyer zu Eissen and Benno Stein: Intrinsic Plagiarism Detection}\\\r
243 {\tt Proceedings of the European Conference on Information Retrieval (ECIR-06)}, {\tt 2006}.\r
244 \r
245 \end{thebibliography}\r
246 \r
247 \smallskip\r
248 \hrule height .1em\r
249 \medskip\r
250 \r
251 % \sffamily\r
252 \r
253 QR kód?\r
254 \r
255 \cemph{Contact information:}\\\r
256         Šimon Suchomel, {\tt suchomel@fi.muni.cz},\\\r
257         Jan Kasprzak, {\tt kas@fi.muni.cz}.\r
258 \r
259 \r
260 \end{multicols}\r
261 \r
262 \end{document}\r
263 \r