%% -*- mode: erlang -*-
[
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% SASL config
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
{sasl, [
{sasl_error_logger, {file, "priv/log/sasl-error.log"}},
{errlog_type, error},
{error_logger_mf_dir, "priv/log/sasl"}, % Log directory
{error_logger_mf_maxbytes, 10485760}, % 10 MB max file size
{error_logger_mf_maxfiles, 5} % 5 files max
]
},
{ebot, [
%% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
%% see EBOT options in ebot.app and add your changes here!
%% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% CACHE
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% DATABASE
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%%
%% you need to set the db backend (COUCHDB or RIAK)
%% in src/ebot.hrl file
{db_hostname, "127.0.0.1"},
%% COUCHDB
%%{db_port, 5984},
%% RIAK
{db_port, 8087},
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% MQ
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% WEB
%% %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
%% -------------------------------------------------------------------------------------------------
%% normalize_url
%% -------------------------------------------------------------------------------------------------
%%
%% {normalize_url, [{RE, NormalizeUrlOptions}, ...]}
%%
%% options of normalize_url :
%% add_final_slash
%% to_lower_case : some web servers treat urls as case insensitive, and some web pages have links with uppercase letters
%% without_internal_links
%% without_queries,
%% {max_depth, 2}
%% the url path will be truncated to a max_depth path
%% http://www.redaelli.org/matteo/blog/a/ -> http://www.redaelli.org/matteo/blog/
%% max_depth should be the same as "tot_new_urls_queues" in ebot_mq.conf
%% you should also start at least one crawler for each depth in [0,max_depth]: see "workers_pool" in this file
%% (TODO) {remove_filename, false}
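%%
%% a minimal sketch of the structure (the "example\\.org" regexp and its options are
%% a hypothetical illustration; the settings actually used by this file follow below):
%%
%% {normalize_url,
%%  [
%%   {"example\\.org", [strip, add_final_slash, {max_depth, 2}]},
%%   {".",             [strip, add_final_slash]}
%%  ]}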
{normalize_url,
[
%% {"\\.com/",
%% [
%% {plugin, ebot_url_util, url_domain},
%% add_final_slash,
%% to_lower_case
%% ]
%% },
%% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
%% default setting for normalize_url
%% !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
%% remember: at least one regexp must match every url,
%% so the catch-all "." should be used
{".",
[
%% -------------------------------------------------------------------
%% {plugin, Module, Function/1}
%% -------------------------------------------------------------------
%% you can call a custom module:function(Url) for normalizing urls
%% are you interested only in domain homepages?
%% {plugin, ebot_url_util, url_domain},
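%% a hypothetical custom plugin, shown only as a hedged sketch (module name,
%% function and return type are assumptions, not part of ebot): it assumes the
%% plugin gets the url as a string and returns the normalized string, and it
%% would be referenced above as {plugin, my_url_plugin, drop_www}
%%
%% -module(my_url_plugin).
%% -export([drop_www/1]).
%% %% strip "www." right after the scheme, e.g. http://www.foo/ -> http://foo/
%% drop_www(Url) ->
%%     re:replace(Url, "://www\\.", "://", [{return, list}]).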
%% removing blank characters at the beginning and end of the url string:
%% yes, it sometimes happens!
strip,
%% -------------------------------------------------------------------
%% {replace_string, [{from,to},..]}
%% -------------------------------------------------------------------
{replace_string, [
%% http://www.gettyre.it/motoweb/XXX;jsessionid=250485C.sae_1
{";[A-Za-z0-9]+=[^&;?]+", ""},
%% some sites have newlines in url links:
%% see http://opensource.linux-mirror.org/index.php
%% TODO: maybe it still doesn't work
{"\n",""},
%% http://github.com/dizzyd/ibrowse
{""\$",""}
]},
%% -------------------------------------------------------------------
%% add_final_slash
%% -------------------------------------------------------------------
%% example: http://www.redaelli.org => http://www.redaelli.org/
add_final_slash,
%% -------------------------------------------------------------------
%% {max_depth, 3}
%% -------------------------------------------------------------------
%% paths > max_depth are truncated to max_depth
%% for instance, if {max_depth,0}
%% http://www.redaelli.org/matteo/ => http://www.redaelli.org/
{max_depth, 4},
%% -------------------------------------------------------------------
%% to_lower_case
%% -------------------------------------------------------------------
%% for some web servers, urls are case insensitive:
%% it is safer to lowercase all urls in order to avoid duplicates in the database
%% {plugin, string, to_lower},
%% -------------------------------------------------------------------
%% without_internal_links
%% -------------------------------------------------------------------
%% internal links (#) are removed
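%% example: http://www.redaelli.org/index.html#top => http://www.redaelli.org/index.html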
without_internal_links,
%% -------------------------------------------------------------------
%% without_queries
%% -------------------------------------------------------------------
%% parameters, like ?a=1&b=3, are removed from urls
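%% example: http://www.redaelli.org/search?a=1&b=3 => http://www.redaelli.org/search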
without_queries
]}%% end default
] % end list of {regexp, ListOptions}
}, %% end normalize_url
%% -------------------------------------------------------------------------------------------------
%% tobe_saved_headers
%% -------------------------------------------------------------------------------------------------
%% headers (if present) that will be saved in the database
{tobe_saved_headers,
[
<<"content-length">>,
<<"content-type">>,
<<"server">>,
<<"x-powered-by">>
]},
%% -------------------------------------------------------------------------------------------------
%% is_valid_image
%% -------------------------------------------------------------------------------------------------
%%
%% this option is useful for checking image links while they are still relative,
%% before they are converted to absolute urls
%%
{is_valid_image,
[
%% the url will be analyzed only if ALL the regexps are satisfied
{validate_all_url_regexps, [
{nomatch, "\.bmp$"},
{nomatch, "\.raw$"}
]
}
]
},
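%% e.g. with the two nomatch regexps above, a relative link such as "img/logo.bmp"
%% would be discarded, while "img/logo.png" would pass the check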
%% -------------------------------------------------------------------------------------------------
%% is_valid_link
%% -------------------------------------------------------------------------------------------------
%%
%% this option is useful for checking links while they are still relative,
%% before they are converted to absolute urls
%%
{is_valid_link,
[
%% the url will be analyzed only if ALL the regexps are satisfied
{validate_all_url_regexps, [
{nomatch, "feed:"},
{nomatch, "ftp:"},
{nomatch, "javascript:"},
{nomatch, "mailto:"},
{nomatch, "news:"}
]
}
]
},
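%% e.g. with the nomatch regexps above, a "mailto:" or "javascript:" link would be
%% discarded, while a normal relative link such as "about.html" would pass the check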
%% -------------------------------------------------------------------------------------------------
%% is_valid_url
%% -------------------------------------------------------------------------------------------------
%%
{is_valid_url,
[
%% you can call your custom function that will return true or false
%% {plugin, Module, function},
%% a trivial example: {plugin, erlang, is_list},
%% a url is valid if its mime type satisfies any of the following regexps
{validate_any_mime_regexps, [
{match, "^text/"}
%%,{match, "^image/"}
]
},
%% the url will be analyzed only if ALL the following regexps are satisfied
{validate_all_url_regexps, [
{match, "^http://"},
%% {nomatch, "^https"},
{nomatch, "//.+//"},
{nomatch, "/bugs/"},
{nomatch, "viewcvs"},
%% Skipping Apache.org urls
{nomatch, "\\.apache\\..+/dist/"},
{nomatch, "/snapshots/"},
{nomatch, "^http://mail-archives"},
{nomatch, "bugs.+/.+"},
%% apache mirror sites.. TODO
{nomatch, "apache\\.fastbull\\.org/.+"},
%% Skipping unwanted files
{nomatch, "\\.deb$"},
{nomatch, "\\.git$"},
{nomatch, "\\.tgz$"},
{nomatch, "\\.jar$"},
{nomatch, "\\.rpm$"},
{nomatch, "\\.tar$"},
{nomatch, "\\.gz$"},
{nomatch, "\\.makefile$"},
{nomatch, "\\.Makefile$"},
% Skipping CVS repositories
{nomatch, "/cvs/\\."},
%% Skipping useless Github pages
{nomatch, "github\\.+/issues"},
{nomatch, "gist\\.github\\.com"},
%% these pages return an incomplete header
{nomatch, "svn\\.github\\.com"},
%% Skipping useless Gitorious pages
{nomatch, "git.+/merge_requests/"},
{nomatch, "git.+/commits/"},
{nomatch, "git.+/trees/"},
%% Skipping Git repositories
{nomatch, "git.+/commit/"},
{nomatch, "git.+/tree/"},
%% Skipping HG repositories
{nomatch, "/changeset/"},
%% Skipping SVN repositories
{nomatch, "svn.+/viewvc/.+/"},
{nomatch, "/svn[\\./]"},
{nomatch, "/branches"},
{nomatch, "/trunk"},
{nomatch, "/tags"}
]
}, %% end of validate_all_regexps
%% The url will be analyzed only if ANY of the following regexps is satisfied.
%% Here you should put the list of web sites to be visited by ebot
%% (a commented single-site example follows this section).
{validate_any_url_regexps, [
%% at least one regexp must be defined
%% {match,"."},
{match, "redaelli\\.org"},
%% Opensource projects
{match, "apache\\.org"},
{match, "freshmeat\\.net"},
{match, "github\\.com"},
{match, "code\\.google\\.com"},
{match, "sourceforge\\.net"},
{match, "ohloh\\.net"},
{match, "bitbucket\\.org"},
{match, "www\\.gettyre\\.it"},
{match, "www\\.tyres-pneus-online\\.co\\.uk"}
]
}
] %% end list of options
} %% end is_valid_url
,
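%% a hedged example of the same option restricted to a single web site: replacing
%% the validate_any_url_regexps list above with the entry below ("www\\.example\\.org"
%% is a placeholder for your own site) would keep ebot on that site only
%%
%% {validate_any_url_regexps, [
%%     {match, "www\\.example\\.org"}
%% ]}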
%% -------------------------------------------------------------------------------------------------
%% obsolete_urls_after_days
%% -------------------------------------------------------------------------------------------------
%%
%% after how many days a url stored in the DB becomes obsolete
{obsolete_urls_after_days, 10},
%% -------------------------------------------------------------------------------------------------
%% save_referrals
%% -------------------------------------------------------------------------------------------------
%%
%% (cumulable) values: external, domain, subdomain
%%
%% domain: means same domain,
%% e.g. true  <= http://www.redaelli.org/a and http://www.redaelli.org/
%%      false <= http://www.redaelli.org/a and http://redaelli.org/
%%
%% subdomain: means same main domain but not same domain,
%% e.g. false <= http://www.redaelli.org/a and http://www.redaelli.org/
%%      true  <= http://www.redaelli.org/a and http://redaelli.org/
%%
%% external: means not same domain and not same main domain,
%% e.g. false <= http://www.redaelli.org/a and http://www.redaelli.org/
%%      false <= http://www.redaelli.org/a and http://redaelli.org/
%%      true  <= http://www.redaelli.org/a and http://matteoredaelli.wordpress.com/
{save_referrals, [external]},
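%% the values are cumulable, e.g. {save_referrals, [domain, subdomain, external]}
%% should save all three kinds of referrals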
%% -------------------------------------------------------------------------------------------------
%% workers_pool
%% -------------------------------------------------------------------------------------------------
%%
%% how many crawler workers will be started for each candidate url queue/depth
%% {workers_pool, [{0,3},{1,2},{2,1}]} means
%% 3 crawlers will analyze urls taken from the AMQP queue ebot.new.0, which contains urls with depth==0
%% (ex. http://www.redaelli.org, http://www.redaelli.org/index.html)
%% 2 crawlers will analyze urls taken from the AMQP queue ebot.new.1, which contains urls with depth==1
%% (ex. http://www.redaelli.org/matteo/, http://www.redaelli.org/matteo/index.html)
%% 1 crawler will analyze urls taken from the AMQP queue ebot.new.2, which contains urls with depth==2
%%{workers_pool, [{0,4}, {1,2}, {2,1}] },
{workers_pool, [{0,2}, {1,2}, {2,2}, {3,2}, {4,2} ] },
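%% note: with {max_depth, 4} set in normalize_url above, the pools here cover every
%% depth in [0,4], presumably one pool per candidate-url queue (ebot.new.0, ebot.new.1, ...)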
%% -------------------------------------------------------------------------------------------------
%% start_workers_at_boot
%% -------------------------------------------------------------------------------------------------
%%
%% are the crawlers started automatically at boot time?
{start_workers_at_boot, true},
%% -------------------------------------------------------------------------------------------------
%% workers_sleep_time
%% -------------------------------------------------------------------------------------------------
%%
%% how many milliseconds will each crawler sleep between two url crawls?
%% this option is useful to avoid putting a heavy workload on the visited websites
%% and on the ebot system itself, if the hardware is not powerful enough
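%% e.g. with the value of 2000 below, each crawler waits 2 seconds between two fetches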
{workers_sleep_time, 2000}
]
}
].