我们在某汽车论坛上针对某一型号汽车随机抽取了100条跟帖进行典型意见分析。通过典型意见引擎得出的结果,可以看出网友在某些问题上对该汽车的直观评价和看法,以及对这些意见的聚类。
请先下载测试数据(text_comments.txt):
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
import sys
import time
import json
import requests
# Endpoints of the comments HTTP API; the task ID is appended to each URL
# when a request is made.
_COMMENTS_API_ROOT = 'http://api.bosonnlp.com/comments/'
COMMENTS_ANALYSIS_URL = _COMMENTS_API_ROOT + 'analysis/'
COMMENTS_PUSH_URL = _COMMENTS_API_ROOT + 'push/'
COMMENTS_RESULT_URL = _COMMENTS_API_ROOT + 'result/'
COMMENTS_STATUS_URL = _COMMENTS_API_ROOT + 'status/'
COMMENTS_CLEAR_URL = _COMMENTS_API_ROOT + 'clear/'

# Task identifier derived from the current Unix time in whole seconds.
TASK_ID = 'task_{0:d}'.format(int(time.time()))

# NOTE: replace YOUR_API_TOKEN with your own API token before testing.
# One shared session so every request carries the same headers.
session = requests.Session()
session.headers.update({
    'X-Token': 'YOUR_API_TOKEN',
    'Content-Type': 'application/json',
})
def comments_status():
    """Fetch the status of the current task.

    Sends a GET request to ``COMMENTS_STATUS_URL + TASK_ID`` through the
    shared ``session``, lets ``raise_for_status()`` raise on a failed HTTP
    response, and returns the ``"status"`` field of the JSON body.
    """
    status_url = COMMENTS_STATUS_URL + TASK_ID
    response = session.get(status_url)
    response.raise_for_status()
    payload = response.json()
    return payload["status"]
def detail_results(idx, comments):
    """Print one group of typical opinions to standard output.

    Args:
        idx: Zero-based position of the group; shown one-based.
        comments: Mapping with the keys ``'opinion'`` (the representative
            opinion), ``'num'`` (the document count) and ``'list'`` (an
            iterable of ``(comment, doc_id)`` pairs).
    """
    heavy_rule = '=' * 50
    light_rule = '-' * 20

    # Header: group number followed by the representative opinion.
    print(heavy_rule)
    print('第%d组典型意见是:' % (idx + 1))
    print(comments['opinion'])

    # Body: document count, then every opinion with its source document ID.
    print(light_rule)
    print('共包含%s份文档,意见内容和原文ID如下:' % comments['num'])
    for entry in comments['list']:
        comment, doc_id = entry
        print(comment, doc_id)
def main():
    """Upload the comments, run the typical-opinion analysis and print the result.

    Steps:
      1. Read ``text_comments.txt`` as UTF-8, one document per line.
      2. Push the documents to the task ``TASK_ID`` in batches of 100; each
         document's ``_id`` is its index in the list of lines read.
      3. Start the analysis and poll the task status every 0.5 seconds.
      4. On ``'DONE'``: fetch the result, print the opinion groups ordered by
         the length of their ``'list'`` (largest first), then clear the task.
         On ``'NOT FOUND'`` or ``'ERROR'``: print a message and stop.

    Any exception raised by ``raise_for_status()`` for a failed HTTP
    response is propagated to the caller.
    """
    print('任务ID:', TASK_ID)

    print('读入数据...')
    with open('text_comments.txt', 'rb') as f:
        # NOTE(review): lines read from a file keep their trailing newline,
        # so `if line` does not drop blank lines — confirm whether blank
        # lines should be skipped (doing so would shift the document IDs).
        docs = [line.decode('utf-8') for line in f if line]

    print('正在上传数据...')
    # `range` rather than `xrange`: `xrange` does not exist on Python 3,
    # while `range` works on both Python 2 and Python 3.
    for i in range(0, len(docs), 100):
        batch = [{'_id': i + idx, 'text': text}
                 for idx, text in enumerate(docs[i:i + 100])]
        data = json.dumps(batch)
        resp = session.post(COMMENTS_PUSH_URL + TASK_ID, data=data.encode('utf-8'))
        resp.raise_for_status()

    print('开始分析...')
    resp = session.get(COMMENTS_ANALYSIS_URL + TASK_ID)
    resp.raise_for_status()

    while True:
        status = comments_status()
        if status == 'DONE':
            resp = session.get(COMMENTS_RESULT_URL + TASK_ID)
            resp.raise_for_status()
            all_comments = resp.json()
            print('一共生成了%d组典型意见' % len(all_comments))
            sort_all_comments = sorted(
                all_comments,
                key=lambda comments: len(comments['list']),
                reverse=True,
            )
            for idx, comments in enumerate(sort_all_comments):
                detail_results(idx, comments)
            # Remove the finished task's data from the server.
            resp = session.get(COMMENTS_CLEAR_URL + TASK_ID)
            resp.raise_for_status()
            break
        elif status == 'NOT FOUND':
            print('找不到典型意见任务。')
            break
        elif status == 'ERROR':
            print('任务失败,请稍后重试。')
            # Stop polling: without this the loop would keep requesting the
            # status and printing the failure message indefinitely.
            break
        else:
            # Still running: print a progress dot without a newline.
            print('.', end='')
            sys.stdout.flush()
        time.sleep(0.5)


if __name__ == '__main__':
    main()
# -*- encoding: utf-8 -*-
from __future__ import print_function, unicode_literals
from bosonnlp import BosonNLP
# NOTE: replace YOUR_API_TOKEN with your own API token before testing.
# Module-level SDK client; main() calls its comments() method.
nlp = BosonNLP('YOUR_API_TOKEN')
def print_comments(idx, comments):
    """Write a single typical-opinion group to standard output.

    Args:
        idx: Zero-based index of the group; displayed as ``idx + 1``.
        comments: Mapping providing ``'opinion'``, ``'num'`` and ``'list'``,
            where ``'list'`` yields ``(comment, doc_id)`` pairs.
    """
    group_separator = '=' * 50
    section_separator = '-' * 20

    print(group_separator)
    print('第%d组典型意见是:' % (idx + 1))
    print(comments['opinion'])
    print(section_separator)
    print('共包含%s份文档,意见内容和原文ID如下:' % comments['num'])

    # One line per member opinion: the text followed by its document ID.
    for member in comments['list']:
        comment, doc_id = member
        print(comment, doc_id)
def main():
    """Analyse the comments file with the SDK client and print every group.

    Reads ``text_comments.txt`` line by line, decodes each line as UTF-8,
    passes the documents to ``nlp.comments`` and prints the returned groups
    in descending order of their ``'num'`` value.
    """
    docs = []
    with open('text_comments.txt', 'rb') as source:
        for raw_line in source:
            if raw_line:
                docs.append(raw_line.decode('utf-8'))

    groups = nlp.comments(docs)

    # Largest groups first.
    ranked_groups = sorted(groups, key=lambda group: group['num'], reverse=True)
    for position, group in enumerate(ranked_groups):
        print_comments(position, group)


if __name__ == '__main__':
    main()
详细的Python SDK comments文档请看 这里 。
>>>[{"opinion": "起步离合太高", //第1组典型意见的代表性意见
"list": [['离合稍有点高', 88], //包含的意见内容和文档ID
['离合行程太高', 110],
['起步离合太高', 215],
['双离合效果不理想', 229]],
"num": 4 //对应的文档数
}...
查看完整的典型意见结果样例:comments_result.txt