Coreseek®  
 | 首页 | 注册 | 回复 | 搜索 | 统计资料 |                 网站首页产品服务开放源码安装使用常见问题中文手册社区交流联系我们 
站务公告 论坛首页 / 站务公告 /

Python数据源操作实例!!!

 
nzinfo
会员
#1 | 发表时间: 2010 07 08 13:56
回复 
测试能否发贴
dancebear
会员
#2 | 发表时间: 2010 07 08 13:59 | 修改: dancebear
回复 
首先谢谢HonestQiao和nzinfo两位同志;对于我对你们不断的骚扰我很抱歉,谢谢
我只列出主要的代码,其他细节请各位自己完善。
此处的ini库的下载地址:http://code.google.com/p/dict4ini/
另外提醒大家,请把所有的关于数据读取的程序都写到你的数据源里,千万别偷懒用继承的方式。可能会生成索引错误的。

如果觉得格式比较错乱的话可以访问这里查看有高亮有格式也是最新的版本:
http://blog.52fq.net/post/97/
可能会比较慢,抱歉

配置文件:
[code=html]
[indexer]
# 每批次扫描的文档ID区间大小(按id范围分批读取,实际读到的记录数可能少于该值)
perpage=1000
[mysql]
#MySQL服务器地址
host= 127.0.0.1
#MySQL 用户名
username=root
#MySQL密码
password=root
#MySQL数据库名
dbname=article
#MySQL 默认字符集
charset=gbk
#MySQL数据表表前缀
tableprefix=cms_
#MySQL服务器端口
port=3306
[/code]
[code=Python]
#! /usr/bin/env python
#coding=utf-8
# coreseek3.2.13 python source文章读取
# author: dancebear
# date: 2010-07-02 11:46
import MySQLdb,logging
# Absolute path of the INI settings file that mainSource.__init__ loads
# (it is read for the [indexer] and [mysql] sections used by the class).
confFilePrivate='/usr/local/search/conf/article.conf'
#print [x.lower() for x in sys.path]
class mainSource(object):
    """Coreseek Python data source that reads CMS articles from MySQL.

    Documents are read in id windows of ``[indexer] perpage`` ids, between the
    MIN(id) and MAX(id) found by OnBeforeIndex().  After every successful
    NextDocument() call the current document is exposed through instance
    attributes named after the fields declared in GetScheme().

    Requires Python 2.6+ (``except ... as`` syntax).
    """

    def __init__(self, cfg):
        """Load settings and reset the paging state.

        The ``cfg`` argument is ignored: settings are always loaded from the
        INI file named by the module-level ``confFilePrivate``.
        """
        # NOTE(review): ``ini`` is not imported in the visible code; presumably
        # the dict4ini module (``import dict4ini as ini``) -- confirm.
        self.cfg = ini.DictIni(confFilePrivate)
        self._rowCount = 0          # rows still unread in the current cursor
        self.m_cursor = None
        self.m_dbconn = None
        # Id range to index and the lower bound of the next id window.
        self.m_minid = 0
        self.m_maxid = 0
        self.m_start_id = 0
        # SELECT statement without the id-window condition.
        self.m_basesql_str = ''

    def GetScheme(self):
        """Return the index schema: the doc id plus text and integer fields."""
        return [
            ('id', {'docid': True}),
            ('fid', {'type': 'integer'}),
            ('description', {'type': 'text'}),
            ('pink', {'type': 'integer'}),
            ('author', {'type': 'text'}),
            ('authorid', {'type': 'integer'}),
            ('subject', {'type': 'text'}),
            ('dateline', {'type': 'integer'}),
            ('state', {'type': 'integer'}),
            ('top', {'type': 'integer'}),
            ('content', {'type': 'text'}),
            ('views', {'type': 'integer'}),
            ('comments', {'type': 'integer'}),
        ]

    def GetFieldOrder(self):
        """Return the text fields in priority order."""
        return ('subject', 'content', 'description', 'author')

    def Connected(self):
        """Open the MySQL connection described by the [mysql] section.

        Returns True on success; logs the problem and returns False otherwise.
        """
        # has_key() is the config object's own method (kept on purpose: the
        # config class is not a plain dict and may not support ``in``).
        if not self.cfg.has_key('mysql'):
            logging.error('Not has MySQL info')
            return False
        mysql_cfg = self.cfg.mysql
        try:
            self.m_dbconn = MySQLdb.connect(
                host=mysql_cfg.host,
                port=int(mysql_cfg.port),
                user=mysql_cfg.username,
                passwd=mysql_cfg.password,
                db=mysql_cfg.dbname)
        except MySQLdb.Error as e:
            logging.error("Error %d: %s", e.args[0], e.args[1])
            return False
        return True

    def OnBeforeIndex(self):
        """Prepare a run: set the charset, find the id range, build the SELECT.

        Returns False when there is no connection or a query fails,
        True otherwise.
        """
        if self.m_dbconn is None:
            return False
        prefix = self.cfg.mysql.tableprefix

        # Switch the connection encoding so gbk (or other) data is supported.
        try:
            self.m_cursor = self.m_dbconn.cursor()
            try:
                self.m_cursor.execute("SET NAMES " + self.cfg.mysql.charset)
            finally:
                self.m_cursor.close()
        except MySQLdb.Error as e:
            logging.error("Error %d: %s", e.args[0], e.args[1])
            return False

        # Smallest and largest document id bound the paging loop.
        sql = """SELECT MIN(id),MAX(id) FROM {$prefix}contentlist"""
        sql = sql.replace("{$prefix}", prefix)
        try:
            self.m_cursor = self.m_dbconn.cursor()
            rowCount = self.m_cursor.execute(sql)
        except MySQLdb.Error as e:
            logging.error("Error %d: %s", e.args[0], e.args[1])
            return False
        if rowCount == 0:
            return False
        tm_row = self.m_cursor.fetchone()
        if tm_row[0]:
            self.m_minid = tm_row[0]
        if tm_row[1]:
            self.m_maxid = tm_row[1]
        self.m_start_id = self.m_minid
        self._rowCount = 0

        sql = """SELECT l.id AS id, l.CateID AS fid,
l.Description AS description,l.pink AS pink,
l.Author as author, l.AuthorID AS poster_id,
l.Title AS title, l.PublicTime AS post_time,
l.State as state,l.Top as top,
c.content AS content,l.HitNum as views, l.CommentNum as comments
FROM {$prefix}contentlist AS l, {$prefix}content AS c WHERE c.ContentID = l.id"""
        self.m_basesql_str = sql.replace("{$prefix}", prefix)
        return True

    def NextDocument(self):
        """Advance to the next document; called once per document.

        Returns True when a document was loaded into the instance attributes,
        False when the id range is exhausted or a database error occurred.
        """
        if self._rowCount > 0:
            # Rows of the current window are still pending in the cursor.
            self._rowCount -= 1
            return self._getRow()
        if self.m_dbconn is None:
            return False
        perpage = int(self.cfg.indexer.perpage)
        if perpage <= 0:
            logging.error('Invalid perpage value: %d', perpage)
            return False
        try:
            # Keep advancing through id windows until one yields rows.  A gap
            # of perpage or more consecutive missing ids must not end the run;
            # only passing m_maxid does.
            while self.m_start_id <= self.m_maxid:
                window_start = self.m_start_id
                window_end = window_start + perpage
                self.m_start_id = window_end
                if self.m_cursor is not None:
                    self.m_cursor.close()
                self.m_cursor = self.m_dbconn.cursor(
                    cursorclass=MySQLdb.cursors.DictCursor)
                sql_condition = (" AND l.id>=" + str(window_start) +
                                 " AND l.id<" + str(window_end))
                logging.info('Select data from %d to %d',
                             window_start, window_end)
                fetched = self.m_cursor.execute(
                    self.m_basesql_str + sql_condition)
                if fetched > 0:
                    # One row is consumed right now by _getRow().
                    self._rowCount = fetched - 1
                    return self._getRow()
        except MySQLdb.Error as e:
            logging.error("Error %d:%s", e.args[0], e.args[1])
            return False
        self._rowCount = 0
        return False

    def _toUtf8(self, value):
        """Return a text column as UTF-8 bytes; None becomes ''."""
        if value is None:
            return ''
        if isinstance(value, bytes):
            # Byte strings arrive in the configured MySQL charset.
            charset = self.cfg.mysql.charset
            return value.decode(charset, 'ignore').encode('utf-8')
        # Already-decoded text (e.g. a unicode-mode connection).
        return value.encode('utf-8')

    def _getRow(self):
        """Copy the next cursor row into the schema attributes.

        Returns True when a row was available, False otherwise.
        """
        m_row = self.m_cursor.fetchone()
        if not m_row:
            return False
        self.id = m_row['id']
        self.fid = m_row['fid']
        self.description = self._toUtf8(m_row['description'])
        self.pink = m_row['pink']
        self.author = self._toUtf8(m_row['author'])
        self.authorid = m_row['poster_id']
        self.subject = self._toUtf8(m_row['title'])
        self.dateline = m_row['post_time']
        self.state = m_row['state']
        self.top = m_row['top']
        self.content = self._toUtf8(m_row['content'])
        self.views = m_row['views']
        self.comments = m_row['comments']
        return True

    def OnIndexFinished(self):
        """Store the highest indexed id in the settings table.

        Reconnects first, as the original code did; if that fails, the
        existing connection (if any) is used.  Returns True after the update,
        False when no connection is available.  Errors raised by the UPDATE
        itself propagate to the caller.
        """
        sql = """UPDATE {$prefix}settings SET `value`='{$id}' WHERE variable='idx_postid'"""
        sql = sql.replace("{$prefix}", self.cfg.mysql.tableprefix)
        sql = sql.replace("{$id}", str(self.m_maxid))

        previous = self.m_dbconn
        if self.Connected():
            if previous is not None and previous is not self.m_dbconn:
                try:
                    previous.close()
                except MySQLdb.Error:
                    # Best effort: the old connection may already be dead.
                    pass
        elif self.m_dbconn is None:
            logging.error('No MySQL connection, idx_postid not updated')
            return False

        cursor = self.m_dbconn.cursor()
        try:
            cursor.execute(sql)
        finally:
            cursor.close()
        # Without an explicit commit the UPDATE is lost on transactional
        # storage engines, because autocommit is off by default.
        self.m_dbconn.commit()
        logging.info('END fetch')
        return True
if __name__ == "__main__":
    # Stand-alone demo: print every document the data source yields.
    conf = {}
    source = mainSource(conf)
    # Connected() opens the MySQL connection; OnBeforeIndex() computes the id
    # range and builds the base SELECT that NextDocument() depends on, so
    # both must succeed before iterating.
    if source.Connected() and source.OnBeforeIndex():
        while source.NextDocument():
            print("id=%d, subject=%s ,content =%s" % (source.id, source.subject, source.content[0:20]))
[/code]
HonestQiao
会员
#3 | 发表时间: 2010 07 08 16:03
回复 
dancebear
会员
#4 | 发表时间: 2010 07 12 17:16
回复 
注意。delta也就是增量索引的数据源的OnIndexFinished请直接更改为一个空函数,否则可能会出现新的数据覆盖旧的数据的问题。
 
回复
Bold Style  Italic Style  Image 链接  URL 链接 
发帖注意:
  • 网址中请去掉http://开头,例如:您需要输入www.coreseek.cn,而不是http://www.coreseek.cn
  • 咨询问题,请贴出详细的操作系统版本、Coreseek版本(Linux环境请给出编译参数)
  • 请仔细查看中文手册和本站安装指南,确认操作正确
  • 请仔细查看常见问题解答,也许你的问题已经有解决方法

» 帐号  » 密码 
发帖前请登录, 或者 注册 .